/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; import jalview.datamodel.IncompleteCodonException; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.util.MappingUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; /** * grab bag of useful alignment manipulation operations Expect these to be * refactored elsewhere at some point. * * @author jimp * */ public class AlignmentUtils { /** * given an existing alignment, create a new alignment including all, or up to * flankSize additional symbols from each sequence's dataset sequence * * @param core * @param flankSize * @return AlignmentI */ public static AlignmentI expandContext(AlignmentI core, int flankSize) { List sq = new ArrayList(); int maxoffset = 0; for (SequenceI s : core.getSequences()) { SequenceI newSeq = s.deriveSequence(); final int newSeqStart = newSeq.getStart() - 1; if (newSeqStart > maxoffset && newSeq.getDatasetSequence().getStart() < s.getStart()) { maxoffset = newSeqStart; } sq.add(newSeq); } if (flankSize > -1) { maxoffset = Math.min(maxoffset, flankSize); } /* * now add offset left and right to create an expanded alignment */ for (SequenceI s : sq) { SequenceI ds = s; while (ds.getDatasetSequence() != null) { ds = ds.getDatasetSequence(); } int s_end = s.findPosition(s.getStart() + s.getLength()); // find available flanking residues for sequence int ustream_ds = s.getStart() - ds.getStart(); int dstream_ds = ds.getEnd() - s_end; // build new flanked sequence // compute gap padding to start of flanking sequence int offset = maxoffset - ustream_ds; // padding is gapChar x ( maxoffset - min(ustream_ds, flank) if (flankSize >= 0) { if (flankSize < ustream_ds) { // take up to flankSize residues offset = maxoffset - flankSize; ustream_ds = flankSize; } if (flankSize <= dstream_ds) { dstream_ds = flankSize - 1; } } // TODO use Character.toLowerCase to avoid creating String objects? char[] upstream = new String(ds.getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray(); char[] downstream = new String(ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase().toCharArray(); char[] coreseq = s.getSequence(); char[] nseq = new char[offset + upstream.length + downstream.length + coreseq.length]; char c = core.getGapCharacter(); int p = 0; for (; p < offset; p++) { nseq[p] = c; } System.arraycopy(upstream, 0, nseq, p, upstream.length); System.arraycopy(coreseq, 0, nseq, p + upstream.length, coreseq.length); System.arraycopy(downstream, 0, nseq, p + coreseq.length + upstream.length, downstream.length); s.setSequence(new String(nseq)); s.setStart(s.getStart() - ustream_ds); s.setEnd(s_end + downstream.length); } AlignmentI newAl = new jalview.datamodel.Alignment( sq.toArray(new SequenceI[0])); for (SequenceI s : sq) { if (s.getAnnotation() != null) { for (AlignmentAnnotation aa : s.getAnnotation()) { aa.adjustForAlignment(); // JAL-1712 fix newAl.addAnnotation(aa); } } } newAl.setDataset(core.getDataset()); return newAl; } /** * Returns the index (zero-based position) of a sequence in an alignment, or * -1 if not found. * * @param al * @param seq * @return */ public static int getSequenceIndex(AlignmentI al, SequenceI seq) { int result = -1; int pos = 0; for (SequenceI alSeq : al.getSequences()) { if (alSeq == seq) { result = pos; break; } pos++; } return result; } /** * Returns a map of lists of sequences in the alignment, keyed by sequence * name. For use in mapping between different alignment views of the same * sequences. * * @see jalview.datamodel.AlignmentI#getSequencesByName() */ public static Map> getSequencesByName( AlignmentI al) { Map> theMap = new LinkedHashMap>(); for (SequenceI seq : al.getSequences()) { String name = seq.getName(); if (name != null) { List seqs = theMap.get(name); if (seqs == null) { seqs = new ArrayList(); theMap.put(name, seqs); } seqs.add(seq); } } return theMap; } /** * Build mapping of protein to cDNA alignment. Mappings are made between * sequences where the cDNA translates to the protein sequence. Any new * mappings are added to the protein alignment. Returns true if any mappings * either already exist or were added, else false. * * @param proteinAlignment * @param cdnaAlignment * @return */ public static boolean mapProteinAlignmentToCdna( final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment) { if (proteinAlignment == null || cdnaAlignment == null) { return false; } Set mappedDna = new HashSet(); Set mappedProtein = new HashSet(); /* * First pass - map sequences where cross-references exist. This include * 1-to-many mappings to support, for example, variant cDNA. */ boolean mappingPerformed = mapProteinToCdna(proteinAlignment, cdnaAlignment, mappedDna, mappedProtein, true); /* * Second pass - map sequences where no cross-references exist. This only * does 1-to-1 mappings and assumes corresponding sequences are in the same * order in the alignments. */ mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment, mappedDna, mappedProtein, false); return mappingPerformed; } /** * Make mappings between compatible sequences (where the cDNA translation * matches the protein). * * @param proteinAlignment * @param cdnaAlignment * @param mappedDna * a set of mapped DNA sequences (to add to) * @param mappedProtein * a set of mapped Protein sequences (to add to) * @param xrefsOnly * if true, only map sequences where xrefs exist * @return */ protected static boolean mapProteinToCdna( final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment, Set mappedDna, Set mappedProtein, boolean xrefsOnly) { boolean mappingExistsOrAdded = false; List thisSeqs = proteinAlignment.getSequences(); for (SequenceI aaSeq : thisSeqs) { boolean proteinMapped = false; AlignedCodonFrame acf = new AlignedCodonFrame(); for (SequenceI cdnaSeq : cdnaAlignment.getSequences()) { /* * Always try to map if sequences have xref to each other; this supports * variant cDNA or alternative splicing for a protein sequence. * * If no xrefs, try to map progressively, assuming that alignments have * mappable sequences in corresponding order. These are not * many-to-many, as that would risk mixing species with similar cDNA * sequences. */ if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq)) { continue; } /* * Don't map non-xrefd sequences more than once each. This heuristic * allows us to pair up similar sequences in ordered alignments. */ if (!xrefsOnly && (mappedProtein.contains(aaSeq) || mappedDna .contains(cdnaSeq))) { continue; } if (mappingExists(proteinAlignment.getCodonFrames(), aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence())) { mappingExistsOrAdded = true; } else { MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq); if (map != null) { acf.addMap(cdnaSeq, aaSeq, map); mappingExistsOrAdded = true; proteinMapped = true; mappedDna.add(cdnaSeq); mappedProtein.add(aaSeq); } } } if (proteinMapped) { proteinAlignment.addCodonFrame(acf); } } return mappingExistsOrAdded; } /** * Answers true if the mappings include one between the given (dataset) * sequences. */ public static boolean mappingExists(List mappings, SequenceI aaSeq, SequenceI cdnaSeq) { if (mappings != null) { for (AlignedCodonFrame acf : mappings) { if (cdnaSeq == acf.getDnaForAaSeq(aaSeq)) { return true; } } } return false; } /** * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA * must be three times the length of the protein, possibly after ignoring * start and/or stop codons, and must translate to the protein. Returns null * if no mapping is determined. * * @param proteinSeqs * @param cdnaSeq * @return */ public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq, SequenceI cdnaSeq) { /* * Here we handle either dataset sequence set (desktop) or absent (applet). * Use only the char[] form of the sequence to avoid creating possibly large * String objects. */ final SequenceI proteinDataset = proteinSeq.getDatasetSequence(); char[] aaSeqChars = proteinDataset != null ? proteinDataset .getSequence() : proteinSeq.getSequence(); final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence(); char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence() : cdnaSeq.getSequence(); if (aaSeqChars == null || cdnaSeqChars == null) { return null; } /* * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping) */ final int mappedLength = 3 * aaSeqChars.length; int cdnaLength = cdnaSeqChars.length; int cdnaStart = cdnaSeq.getStart(); int cdnaEnd = cdnaSeq.getEnd(); final int proteinStart = proteinSeq.getStart(); final int proteinEnd = proteinSeq.getEnd(); /* * If lengths don't match, try ignoring stop codon. */ if (cdnaLength != mappedLength && cdnaLength > 2) { String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3) .toUpperCase(); for (String stop : ResidueProperties.STOP) { if (lastCodon.equals(stop)) { cdnaEnd -= 3; cdnaLength -= 3; break; } } } /* * If lengths still don't match, try ignoring start codon. */ int startOffset = 0; if (cdnaLength != mappedLength && cdnaLength > 2 && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase() .equals(ResidueProperties.START)) { startOffset += 3; cdnaStart += 3; cdnaLength -= 3; } if (cdnaLength != mappedLength) { return null; } if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars)) { return null; } MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] { proteinStart, proteinEnd }, 3, 1); return map; } /** * Test whether the given cdna sequence, starting at the given offset, * translates to the given amino acid sequence, using the standard translation * table. Designed to fail fast i.e. as soon as a mismatch position is found. * * @param cdnaSeqChars * @param cdnaStart * @param aaSeqChars * @return */ protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart, char[] aaSeqChars) { if (cdnaSeqChars == null || aaSeqChars == null) { return false; } int aaResidue = 0; for (int i = cdnaStart; i < cdnaSeqChars.length - 2 && aaResidue < aaSeqChars.length; i += 3, aaResidue++) { String codon = String.valueOf(cdnaSeqChars, i, 3); final String translated = ResidueProperties.codonTranslate(codon); /* * allow * in protein to match untranslatable in dna */ final char aaRes = aaSeqChars[aaResidue]; if ((translated == null || "STOP".equals(translated)) && aaRes == '*') { continue; } if (translated == null || !(aaRes == translated.charAt(0))) { // debug // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": " // + codon + "(" + translated + ") != " + aaRes)); return false; } } // fail if we didn't match all of the aa sequence return (aaResidue == aaSeqChars.length); } /** * Align sequence 'seq' to match the alignment of a mapped sequence. Note this * currently assumes that we are aligning cDNA to match protein. * * @param seq * the sequence to be realigned * @param al * the alignment whose sequence alignment is to be 'copied' * @param gap * character string represent a gap in the realigned sequence * @param preserveUnmappedGaps * @param preserveMappedGaps * @return true if the sequence was realigned, false if it could not be */ public static boolean alignSequenceAs(SequenceI seq, AlignmentI al, String gap, boolean preserveMappedGaps, boolean preserveUnmappedGaps) { /* * Get any mappings from the source alignment to the target (dataset) * sequence. */ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with // all mappings. Would it help to constrain this? List mappings = al.getCodonFrame(seq); if (mappings == null || mappings.isEmpty()) { return false; } /* * Locate the aligned source sequence whose dataset sequence is mapped. We * just take the first match here (as we can't align like more than one * sequence). */ SequenceI alignFrom = null; AlignedCodonFrame mapping = null; for (AlignedCodonFrame mp : mappings) { alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al); if (alignFrom != null) { mapping = mp; break; } } if (alignFrom == null) { return false; } alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(), preserveMappedGaps, preserveUnmappedGaps); return true; } /** * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to * match residues and codons. Flags control whether existing gaps in unmapped * (intron) and mapped (exon) regions are preserved or not. Gaps between * intron and exon are only retained if both flags are set. * * @param alignTo * @param alignFrom * @param mapping * @param myGap * @param sourceGap * @param preserveUnmappedGaps * @param preserveMappedGaps */ public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom, AlignedCodonFrame mapping, String myGap, char sourceGap, boolean preserveMappedGaps, boolean preserveUnmappedGaps) { // TODO generalise to work for Protein-Protein, dna-dna, dna-protein // aligned and dataset sequence positions, all base zero int thisSeqPos = 0; int sourceDsPos = 0; int basesWritten = 0; char myGapChar = myGap.charAt(0); int ratio = myGap.length(); int fromOffset = alignFrom.getStart() - 1; int toOffset = alignTo.getStart() - 1; int sourceGapMappedLength = 0; boolean inExon = false; final char[] thisSeq = alignTo.getSequence(); final char[] thatAligned = alignFrom.getSequence(); StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); /* * Traverse the 'model' aligned sequence */ for (char sourceChar : thatAligned) { if (sourceChar == sourceGap) { sourceGapMappedLength += ratio; continue; } /* * Found a non-gap character. Locate its mapped region if any. */ sourceDsPos++; // Note mapping positions are base 1, our sequence positions base 0 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom, sourceDsPos + fromOffset); if (mappedPos == null) { /* * unmapped position; treat like a gap */ sourceGapMappedLength += ratio; // System.err.println("Can't align: no codon mapping to residue " // + sourceDsPos + "(" + sourceChar + ")"); // return; continue; } int mappedCodonStart = mappedPos[0]; // position (1...) of codon start int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos StringBuilder trailingCopiedGap = new StringBuilder(); /* * Copy dna sequence up to and including this codon. Optionally, include * gaps before the codon starts (in introns) and/or after the codon starts * (in exons). * * Note this only works for 'linear' splicing, not reverse or interleaved. * But then 'align dna as protein' doesn't make much sense otherwise. */ int intronLength = 0; while (basesWritten + toOffset < mappedCodonEnd && thisSeqPos < thisSeq.length) { final char c = thisSeq[thisSeqPos++]; if (c != myGapChar) { basesWritten++; int sourcePosition = basesWritten + toOffset; if (sourcePosition < mappedCodonStart) { /* * Found an unmapped (intron) base. First add in any preceding gaps * (if wanted). */ if (preserveUnmappedGaps && trailingCopiedGap.length() > 0) { thisAligned.append(trailingCopiedGap.toString()); intronLength += trailingCopiedGap.length(); trailingCopiedGap = new StringBuilder(); } intronLength++; inExon = false; } else { final boolean startOfCodon = sourcePosition == mappedCodonStart; int gapsToAdd = calculateGapsToInsert(preserveMappedGaps, preserveUnmappedGaps, sourceGapMappedLength, inExon, trailingCopiedGap.length(), intronLength, startOfCodon); for (int i = 0; i < gapsToAdd; i++) { thisAligned.append(myGapChar); } sourceGapMappedLength = 0; inExon = true; } thisAligned.append(c); trailingCopiedGap = new StringBuilder(); } else { if (inExon && preserveMappedGaps) { trailingCopiedGap.append(myGapChar); } else if (!inExon && preserveUnmappedGaps) { trailingCopiedGap.append(myGapChar); } } } } /* * At end of model aligned sequence. Copy any remaining target sequence, optionally * including (intron) gaps. */ while (thisSeqPos < thisSeq.length) { final char c = thisSeq[thisSeqPos++]; if (c != myGapChar || preserveUnmappedGaps) { thisAligned.append(c); } sourceGapMappedLength--; } /* * finally add gaps to pad for any trailing source gaps or * unmapped characters */ if (preserveUnmappedGaps) { while (sourceGapMappedLength > 0) { thisAligned.append(myGapChar); sourceGapMappedLength--; } } /* * All done aligning, set the aligned sequence. */ alignTo.setSequence(new String(thisAligned)); } /** * Helper method to work out how many gaps to insert when realigning. * * @param preserveMappedGaps * @param preserveUnmappedGaps * @param sourceGapMappedLength * @param inExon * @param trailingCopiedGap * @param intronLength * @param startOfCodon * @return */ protected static int calculateGapsToInsert(boolean preserveMappedGaps, boolean preserveUnmappedGaps, int sourceGapMappedLength, boolean inExon, int trailingGapLength, int intronLength, final boolean startOfCodon) { int gapsToAdd = 0; if (startOfCodon) { /* * Reached start of codon. Ignore trailing gaps in intron unless we are * preserving gaps in both exon and intron. Ignore them anyway if the * protein alignment introduces a gap at least as large as the intronic * region. */ if (inExon && !preserveMappedGaps) { trailingGapLength = 0; } if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps)) { trailingGapLength = 0; } if (inExon) { gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); } else { if (intronLength + trailingGapLength <= sourceGapMappedLength) { gapsToAdd = sourceGapMappedLength - intronLength; } else { gapsToAdd = Math.min(intronLength + trailingGapLength - sourceGapMappedLength, trailingGapLength); } } } else { /* * second or third base of codon; check for any gaps in dna */ if (!preserveMappedGaps) { trailingGapLength = 0; } gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); } return gapsToAdd; } /** * Returns a list of sequences mapped from the given sequences and aligned * (gapped) in the same way. For example, the cDNA for aligned protein, where * a single gap in protein generates three gaps in cDNA. * * @param sequences * @param gapCharacter * @param mappings * @return */ public static List getAlignedTranslation( List sequences, char gapCharacter, Set mappings) { List alignedSeqs = new ArrayList(); for (SequenceI seq : sequences) { List mapped = getAlignedTranslation(seq, gapCharacter, mappings); alignedSeqs.addAll(mapped); } return alignedSeqs; } /** * Returns sequences aligned 'like' the source sequence, as mapped by the * given mappings. Normally we expect zero or one 'mapped' sequences, but this * will support 1-to-many as well. * * @param seq * @param gapCharacter * @param mappings * @return */ protected static List getAlignedTranslation(SequenceI seq, char gapCharacter, Set mappings) { List result = new ArrayList(); for (AlignedCodonFrame mapping : mappings) { if (mapping.involvesSequence(seq)) { SequenceI mapped = getAlignedTranslation(seq, gapCharacter, mapping); if (mapped != null) { result.add(mapped); } } } return result; } /** * Returns the translation of 'seq' (as held in the mapping) with * corresponding alignment (gaps). * * @param seq * @param gapCharacter * @param mapping * @return */ protected static SequenceI getAlignedTranslation(SequenceI seq, char gapCharacter, AlignedCodonFrame mapping) { String gap = String.valueOf(gapCharacter); boolean toDna = false; int fromRatio = 1; SequenceI mapTo = mapping.getDnaForAaSeq(seq); if (mapTo != null) { // mapping is from protein to nucleotide toDna = true; // should ideally get gap count ratio from mapping gap = String.valueOf(new char[] { gapCharacter, gapCharacter, gapCharacter }); } else { // mapping is from nucleotide to protein mapTo = mapping.getAaForDnaSeq(seq); fromRatio = 3; } StringBuilder newseq = new StringBuilder(seq.getLength() * (toDna ? 3 : 1)); int residueNo = 0; // in seq, base 1 int[] phrase = new int[fromRatio]; int phraseOffset = 0; int gapWidth = 0; boolean first = true; final Sequence alignedSeq = new Sequence("", ""); for (char c : seq.getSequence()) { if (c == gapCharacter) { gapWidth++; if (gapWidth >= fromRatio) { newseq.append(gap); gapWidth = 0; } } else { phrase[phraseOffset++] = residueNo + 1; if (phraseOffset == fromRatio) { /* * Have read a whole codon (or protein residue), now translate: map * source phrase to positions in target sequence add characters at * these positions to newseq Note mapping positions are base 1, our * sequence positions base 0. */ SearchResults sr = new SearchResults(); for (int pos : phrase) { mapping.markMappedRegion(seq, pos, sr); } newseq.append(sr.getCharacters()); if (first) { first = false; // Hack: Copy sequence dataset, name and description from // SearchResults.match[0].sequence // TODO? carry over sequence names from original 'complement' // alignment SequenceI mappedTo = sr.getResultSequence(0); alignedSeq.setName(mappedTo.getName()); alignedSeq.setDescription(mappedTo.getDescription()); alignedSeq.setDatasetSequence(mappedTo); } phraseOffset = 0; } residueNo++; } } alignedSeq.setSequence(newseq.toString()); return alignedSeq; } /** * Realigns the given protein to match the alignment of the dna, using codon * mappings to translate aligned codon positions to protein residues. * * @param protein * the alignment whose sequences are realigned by this method * @param dna * the dna alignment whose alignment we are 'copying' * @return the number of sequences that were realigned */ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) { List unmappedProtein = new ArrayList(); Map> alignedCodons = buildCodonColumnsMap( protein, dna, unmappedProtein); return alignProteinAs(protein, alignedCodons, unmappedProtein); } /** * Builds a map whose key is an aligned codon position (3 alignment column * numbers base 0), and whose value is a map from protein sequence to each * protein's peptide residue for that codon. The map generates an ordering of * the codons, and allows us to read off the peptides at each position in * order to assemble 'aligned' protein sequences. * * @param protein * the protein alignment * @param dna * the coding dna alignment * @param unmappedProtein * any unmapped proteins are added to this list * @return */ protected static Map> buildCodonColumnsMap( AlignmentI protein, AlignmentI dna, List unmappedProtein) { /* * maintain a list of any proteins with no mappings - these will be * rendered 'as is' in the protein alignment as we can't align them */ unmappedProtein.addAll(protein.getSequences()); List mappings = protein.getCodonFrames(); /* * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of * {dnaSequence, {proteinSequence, codonProduct}} at that position. The * comparator keeps the codon positions ordered. */ Map> alignedCodons = new TreeMap>( new CodonComparator()); for (SequenceI dnaSeq : dna.getSequences()) { for (AlignedCodonFrame mapping : mappings) { SequenceI prot = mapping.findAlignedSequence( dnaSeq.getDatasetSequence(), protein); if (prot != null) { Mapping seqMap = mapping.getMappingForSequence(dnaSeq); addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); unmappedProtein.remove(prot); } } } /* * Finally add any unmapped peptide start residues (e.g. for incomplete * codons) as if at the codon position before the second residue */ int mappedSequenceCount = protein.getHeight() - unmappedProtein.size(); addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount); return alignedCodons; } /** * Scans for any protein mapped from position 2 (meaning unmapped start * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the * preceding position in the alignment * * @param alignedCodons * the codon-to-peptide map * @param mappedSequenceCount * the number of distinct sequences in the map */ protected static void addUnmappedPeptideStarts( Map> alignedCodons, int mappedSequenceCount) { // TODO there must be an easier way! root problem is that our mapping data // model does not include phase so can't map part of a codon to a peptide List sequencesChecked = new ArrayList(); AlignedCodon lastCodon = null; Map toAdd = new HashMap(); for (Entry> entry : alignedCodons .entrySet()) { for (Entry sequenceCodon : entry.getValue() .entrySet()) { SequenceI seq = sequenceCodon.getKey(); if (sequencesChecked.contains(seq)) { continue; } sequencesChecked.add(seq); AlignedCodon codon = sequenceCodon.getValue(); if (codon.peptideCol > 1) { System.err .println("Problem mapping protein with >1 unmapped start positions: " + seq.getName()); } else if (codon.peptideCol == 1) { /* * first position (peptideCol == 0) was unmapped - add it */ if (lastCodon != null) { AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1, lastCodon.pos2, lastCodon.pos3, String.valueOf(seq .getCharAt(0)), 0); toAdd.put(seq, firstPeptide); } else { /* * unmapped residue at start of alignment (no prior column) - * 'insert' at nominal codon [0, 0, 0] */ AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0, String.valueOf(seq.getCharAt(0)), 0); toAdd.put(seq, firstPeptide); } } if (sequencesChecked.size() == mappedSequenceCount) { // no need to check past first mapped position in all sequences break; } } lastCodon = entry.getKey(); } /* * add any new codons safely after iterating over the map */ for (Entry startCodon : toAdd.entrySet()) { addCodonToMap(alignedCodons, startCodon.getValue(), startCodon.getKey()); } } /** * Update the aligned protein sequences to match the codon alignments given in * the map. * * @param protein * @param alignedCodons * an ordered map of codon positions (columns), with sequence/peptide * values present in each column * @param unmappedProtein * @return */ protected static int alignProteinAs(AlignmentI protein, Map> alignedCodons, List unmappedProtein) { /* * Prefill aligned sequences with gaps before inserting aligned protein * residues. */ int alignedWidth = alignedCodons.size(); char[] gaps = new char[alignedWidth]; Arrays.fill(gaps, protein.getGapCharacter()); String allGaps = String.valueOf(gaps); for (SequenceI seq : protein.getSequences()) { if (!unmappedProtein.contains(seq)) { seq.setSequence(allGaps); } } int column = 0; for (AlignedCodon codon : alignedCodons.keySet()) { final Map columnResidues = alignedCodons .get(codon); for (Entry entry : columnResidues.entrySet()) { // place translated codon at its column position in sequence entry.getKey().getSequence()[column] = entry.getValue().product .charAt(0); } column++; } return 0; } /** * Populate the map of aligned codons by traversing the given sequence * mapping, locating the aligned positions of mapped codons, and adding those * positions and their translation products to the map. * * @param dna * the aligned sequence we are mapping from * @param protein * the sequence to be aligned to the codons * @param gapChar * the gap character in the dna sequence * @param seqMap * a mapping to a sequence translation * @param alignedCodons * the map we are building up */ static void addCodonPositions(SequenceI dna, SequenceI protein, char gapChar, Mapping seqMap, Map> alignedCodons) { Iterator codons = seqMap.getCodonIterator(dna, gapChar); /* * add codon positions, and their peptide translations, to the alignment * map, while remembering the first codon mapped */ while (codons.hasNext()) { try { AlignedCodon codon = codons.next(); addCodonToMap(alignedCodons, codon, protein); } catch (IncompleteCodonException e) { // possible incomplete trailing codon - ignore } } } /** * Helper method to add a codon-to-peptide entry to the aligned codons map * * @param alignedCodons * @param codon * @param protein */ protected static void addCodonToMap( Map> alignedCodons, AlignedCodon codon, SequenceI protein) { Map seqProduct = alignedCodons.get(codon); if (seqProduct == null) { seqProduct = new HashMap(); alignedCodons.put(codon, seqProduct); } seqProduct.put(protein, codon); } /** * Returns true if a cDNA/Protein mapping either exists, or could be made, * between at least one pair of sequences in the two alignments. Currently, * the logic is: *

One alignment must be nucleotide, and the other protein
At least one pair of sequences must be already mapped, or mappable
Mappable means the nucleotide translation matches the protein sequence
The translation may ignore start and stop codons if present in the * nucleotide

* * @param al1 * @param al2 * @return */ public static boolean isMappable(AlignmentI al1, AlignmentI al2) { if (al1 == null || al2 == null) { return false; } /* * Require one nucleotide and one protein */ if (al1.isNucleotide() == al2.isNucleotide()) { return false; } AlignmentI dna = al1.isNucleotide() ? al1 : al2; AlignmentI protein = dna == al1 ? al2 : al1; List mappings = protein.getCodonFrames(); for (SequenceI dnaSeq : dna.getSequences()) { for (SequenceI proteinSeq : protein.getSequences()) { if (isMappable(dnaSeq, proteinSeq, mappings)) { return true; } } } return false; } /** * Returns true if the dna sequence is mapped, or could be mapped, to the * protein sequence. * * @param dnaSeq * @param proteinSeq * @param mappings * @return */ protected static boolean isMappable(SequenceI dnaSeq, SequenceI proteinSeq, List mappings) { if (dnaSeq == null || proteinSeq == null) { return false; } SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq .getDatasetSequence(); SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq : proteinSeq.getDatasetSequence(); for (AlignedCodonFrame mapping : mappings) { if (proteinDs == mapping.getAaForDnaSeq(dnaDs)) { /* * already mapped */ return true; } } /* * Just try to make a mapping (it is not yet stored), test whether * successful. */ return mapProteinSequenceToCdna(proteinDs, dnaDs) != null; } /** * Finds any reference annotations associated with the sequences in * sequenceScope, that are not already added to the alignment, and adds them * to the 'candidates' map. Also populates a lookup table of annotation * labels, keyed by calcId, for use in constructing tooltips or the like. * * @param sequenceScope * the sequences to scan for reference annotations * @param labelForCalcId * (optional) map to populate with label for calcId * @param candidates * map to populate with annotations for sequence * @param al * the alignment to check for presence of annotations */ public static void findAddableReferenceAnnotations( List sequenceScope, Map labelForCalcId, final Map> candidates, AlignmentI al) { if (sequenceScope == null) { return; } /* * For each sequence in scope, make a list of any annotations on the * underlying dataset sequence which are not already on the alignment. * * Add to a map of { alignmentSequence, } */ for (SequenceI seq : sequenceScope) { SequenceI dataset = seq.getDatasetSequence(); if (dataset == null) { continue; } AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation(); if (datasetAnnotations == null) { continue; } final List result = new ArrayList(); for (AlignmentAnnotation dsann : datasetAnnotations) { /* * Find matching annotations on the alignment. If none is found, then * add this annotation to the list of 'addable' annotations for this * sequence. */ final Iterable matchedAlignmentAnnotations = al .findAnnotations(seq, dsann.getCalcId(), dsann.label); if (!matchedAlignmentAnnotations.iterator().hasNext()) { result.add(dsann); if (labelForCalcId != null) { labelForCalcId.put(dsann.getCalcId(), dsann.label); } } } /* * Save any addable annotations for this sequence */ if (!result.isEmpty()) { candidates.put(seq, result); } } } /** * Adds annotations to the top of the alignment annotations, in the same order * as their related sequences. * * @param annotations * the annotations to add * @param alignment * the alignment to add them to * @param selectionGroup * current selection group (or null if none) */ public static void addReferenceAnnotations( Map> annotations, final AlignmentI alignment, final SequenceGroup selectionGroup) { for (SequenceI seq : annotations.keySet()) { for (AlignmentAnnotation ann : annotations.get(seq)) { AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann); int startRes = 0; int endRes = ann.annotations.length; if (selectionGroup != null) { startRes = selectionGroup.getStartRes(); endRes = selectionGroup.getEndRes(); } copyAnn.restrict(startRes, endRes); /* * Add to the sequence (sets copyAnn.datasetSequence), unless the * original annotation is already on the sequence. */ if (!seq.hasAnnotation(ann)) { seq.addAlignmentAnnotation(copyAnn); } // adjust for gaps copyAnn.adjustForAlignment(); // add to the alignment and set visible alignment.addAnnotation(copyAnn); copyAnn.visible = true; } } } /** * Set visibility of alignment annotations of specified types (labels), for * specified sequences. This supports controls like * "Show all secondary structure", "Hide all Temp factor", etc. * * @al the alignment to scan for annotations * @param types * the types (labels) of annotations to be updated * @param forSequences * if not null, only annotations linked to one of these sequences are * in scope for update; if null, acts on all sequence annotations * @param anyType * if this flag is true, 'types' is ignored (label not checked) * @param doShow * if true, set visibility on, else set off */ public static void showOrHideSequenceAnnotations(AlignmentI al, Collection types, List forSequences, boolean anyType, boolean doShow) { for (AlignmentAnnotation aa : al.getAlignmentAnnotation()) { if (anyType || types.contains(aa.label)) { if ((aa.sequenceRef != null) && (forSequences == null || forSequences .contains(aa.sequenceRef))) { aa.visible = doShow; } } } } /** * Returns true if either sequence has a cross-reference to the other * * @param seq1 * @param seq2 * @return */ public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2) { // Note: moved here from class CrossRef as the latter class has dependencies // not availability to the applet's classpath return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1); } /** * Returns true if seq1 has a cross-reference to seq2. Currently this assumes * that sequence name is structured as Source|AccessionId. * * @param seq1 * @param seq2 * @return */ public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2) { if (seq1 == null || seq2 == null) { return false; } String name = seq2.getName(); final DBRefEntry[] xrefs = seq1.getDBRefs(); if (xrefs != null) { for (DBRefEntry xref : xrefs) { String xrefName = xref.getSource() + "|" + xref.getAccessionId(); // case-insensitive test, consistent with DBRefEntry.equalRef() if (xrefName.equalsIgnoreCase(name)) { return true; } } } return false; } /** * Constructs an alignment consisting of the mapped (CDS) regions in the given * nucleotide sequences, and updates mappings to match. The new sequences are * aligned as per the original sequences (with gapped columns omitted). * * @param dna * aligned dna sequences * @param mappings * from dna to protein; these are replaced with new mappings * @param gapChar * @return an alignment whose sequences are the cds-only parts of the dna * sequences (or null if no mappings are found) */ public static AlignmentI makeCdsAlignment(SequenceI[] dna, List mappings, char gapChar) { List cdsColumns = findCdsColumns(dna); /* * create CDS sequences and new mappings * (from cdna to cds, and cds to peptide) */ List newMappings = new ArrayList(); List cdsSequences = new ArrayList(); for (SequenceI dnaSeq : dna) { final SequenceI ds = dnaSeq.getDatasetSequence(); List seqMappings = MappingUtils .findMappingsForSequence(ds, mappings); for (AlignedCodonFrame acf : seqMappings) { AlignedCodonFrame newMapping = new AlignedCodonFrame(); final List mappedCds = makeCdsSequences(dnaSeq, acf, cdsColumns, newMapping, gapChar); if (!mappedCds.isEmpty()) { cdsSequences.addAll(mappedCds); newMappings.add(newMapping); } } } AlignmentI al = new Alignment( cdsSequences.toArray(new SequenceI[cdsSequences.size()])); al.setGapCharacter(gapChar); al.setDataset(null); /* * Replace the old mappings with the new ones */ mappings.clear(); mappings.addAll(newMappings); return al; } /** * Returns a consolidated list of column ranges where at least one sequence * has a CDS feature. This assumes CDS features are on genomic sequence i.e. * are for contiguous CDS ranges (no gaps). * * @param seqs * @return */ public static List findCdsColumns(SequenceI[] seqs) { // TODO use refactored code from AlignViewController // markColumnsContainingFeatures, not reinvent the wheel! List result = new ArrayList(); for (SequenceI seq : seqs) { result.addAll(findCdsColumns(seq)); } /* * sort and compact the list into ascending, non-overlapping ranges */ Collections.sort(result, new Comparator() { @Override public int compare(int[] o1, int[] o2) { return Integer.compare(o1[0], o2[0]); } }); result = MapList.coalesceRanges(result); return result; } public static List findCdsColumns(SequenceI seq) { List result = new ArrayList(); SequenceOntologyI so = SequenceOntologyFactory.getInstance(); SequenceFeature[] sfs = seq.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature sf : sfs) { if (so.isA(sf.getType(), SequenceOntologyI.CDS)) { int colStart = seq.findIndex(sf.getBegin()); int colEnd = seq.findIndex(sf.getEnd()); result.add(new int[] { colStart, colEnd }); } } } return result; } /** * Answers true if all sequences have a gap at (or do not extend to) the * specified column position (base 1) * * @param seqs * @param col * @return */ public static boolean isGappedColumn(List seqs, int col) { if (seqs != null) { for (SequenceI seq : seqs) { if (!Comparison.isGap(seq.getCharAt(col - 1))) { return false; } } } return true; } /** * Returns the column ranges (base 1) of each aligned sequence that are * involved in any mapping. This is a helper method for aligning protein * products of aligned transcripts. * * @param mappedSequences * (possibly gapped) dna sequences * @param mappings * @return */ protected static List> getMappedColumns( List mappedSequences, List mappings) { List> result = new ArrayList>(); for (SequenceI seq : mappedSequences) { List columns = new ArrayList(); List seqMappings = MappingUtils .findMappingsForSequence(seq, mappings); for (AlignedCodonFrame mapping : seqMappings) { List maps = mapping.getMappingsForSequence(seq); for (Mapping map : maps) { /* * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } * Find and add the overall aligned column range for each */ for (int[] cdsRange : map.getMap().getFromRanges()) { int startPos = cdsRange[0]; int endPos = cdsRange[1]; int startCol = seq.findIndex(startPos); int endCol = seq.findIndex(endPos); columns.add(new int[] { startCol, endCol }); } } } result.add(columns); } return result; } /** * Helper method to make cds-only sequences and populate their mappings to * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues *

* Typically eukaryotic dna will include cds encoding for a single peptide * sequence i.e. return a single result. Bacterial dna may have overlapping * cds mappings coding for multiple peptides so return multiple results * (example EMBL KF591215). * * @param dnaSeq * a dna aligned sequence * @param mapping * containing one or more mappings of the sequence to protein * @param ungappedCdsColumns * @param newMappings * the new mapping to populate, from the cds-only sequences to their * mapped protein sequences * @return */ protected static List makeCdsSequences(SequenceI dnaSeq, AlignedCodonFrame mapping, List ungappedCdsColumns, AlignedCodonFrame newMappings, char gapChar) { List cdsSequences = new ArrayList(); List seqMappings = mapping.getMappingsForSequence(dnaSeq); for (Mapping seqMapping : seqMappings) { SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, ungappedCdsColumns, gapChar); cdsSequences.add(cds); /* * add new mappings, from dna to cds, and from cds to peptide */ MapList dnaToCds = addCdsMappings(dnaSeq.getDatasetSequence(), cds, seqMapping, newMappings); /* * transfer any features on dna that overlap the CDS */ transferFeatures(dnaSeq, cds, dnaToCds, null, SequenceOntologyI.CDS); } return cdsSequences; } /** * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the * feature start/end ranges, optionally omitting specified feature types. * Returns the number of features copied. * * @param fromSeq * @param toSeq * @param select * if not null, only features of this type are copied (including * subtypes in the Sequence Ontology) * @param mapping * the mapping from 'fromSeq' to 'toSeq' * @param omitting */ public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq, MapList mapping, String select, String... omitting) { SequenceI copyTo = toSeq; while (copyTo.getDatasetSequence() != null) { copyTo = copyTo.getDatasetSequence(); } SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int count = 0; SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature sf : sfs) { String type = sf.getType(); if (select != null && !so.isA(type, select)) { continue; } boolean omit = false; for (String toOmit : omitting) { if (type.equals(toOmit)) { omit = true; } } if (omit) { continue; } /* * locate the mapped range - null if either start or end is * not mapped (no partial overlaps are calculated) */ int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedTo = mapping.locateInTo(start, end); /* * if whole exon range doesn't map, try interpreting it * as 5' or 3' exon overlapping the CDS range */ if (mappedTo == null) { mappedTo = mapping.locateInTo(end, end); if (mappedTo != null) { /* * end of exon is in CDS range - 5' overlap * to a range from the start of the peptide */ mappedTo[0] = 1; } } if (mappedTo == null) { mappedTo = mapping.locateInTo(start, start); if (mappedTo != null) { /* * start of exon is in CDS range - 3' overlap * to a range up to the end of the peptide */ mappedTo[1] = toSeq.getLength(); } } if (mappedTo != null) { SequenceFeature copy = new SequenceFeature(sf); copy.setBegin(Math.min(mappedTo[0], mappedTo[1])); copy.setEnd(Math.max(mappedTo[0], mappedTo[1])); copyTo.addSequenceFeature(copy); count++; } } } return count; } /** * Creates and adds mappings *

from cds to peptide
from dna to cds

* and returns the dna-to-cds mapping * * @param dnaSeq * @param cdsSeq * @param dnaMapping * @param newMappings * @return */ protected static MapList addCdsMappings(SequenceI dnaSeq, SequenceI cdsSeq, Mapping dnaMapping, AlignedCodonFrame newMappings) { cdsSeq.createDatasetSequence(); /* * CDS to peptide is just a contiguous 3:1 mapping, with * the peptide ranges taken unchanged from the dna mapping */ List cdsRanges = new ArrayList(); SequenceI cdsDataset = cdsSeq.getDatasetSequence(); cdsRanges.add(new int[] { 1, cdsDataset.getLength() }); MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() .getToRanges(), 3, 1); newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide); /* * dna 'from' ranges map 1:1 to the contiguous extracted CDS */ MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); newMappings.addMap(dnaSeq, cdsDataset, dnaToCds); return dnaToCds; } /** * Makes and returns a CDS-only sequence, where the CDS regions are identified * as the 'from' ranges of the mapping on the dna. * * @param dnaSeq * nucleotide sequence * @param seqMapping * mappings from CDS regions of nucleotide * @param ungappedCdsColumns * @return */ protected static SequenceI makeCdsSequence(SequenceI dnaSeq, Mapping seqMapping, List ungappedCdsColumns, char gapChar) { int cdsWidth = MappingUtils.getLength(ungappedCdsColumns); /* * populate CDS columns with the aligned * column character if that column is mapped (which may be a gap * if an intron interrupts a codon), else with a gap */ List fromRanges = seqMapping.getMap().getFromRanges(); char[] cdsChars = new char[cdsWidth]; int pos = 0; for (int[] columns : ungappedCdsColumns) { for (int i = columns[0]; i <= columns[1]; i++) { char dnaChar = dnaSeq.getCharAt(i - 1); if (Comparison.isGap(dnaChar)) { cdsChars[pos] = gapChar; } else { int seqPos = dnaSeq.findPosition(i - 1); if (MappingUtils.contains(fromRanges, seqPos)) { cdsChars[pos] = dnaChar; } else { cdsChars[pos] = gapChar; } } pos++; } } SequenceI cdsSequence = new Sequence(dnaSeq.getName(), String.valueOf(cdsChars)); transferDbRefs(seqMapping.getTo(), cdsSequence); return cdsSequence; } /** * Locate any xrefs to CDS databases on the protein product and attach to the * CDS sequence. Also add as a sub-token of the sequence name. * * @param from * @param to */ protected static void transferDbRefs(SequenceI from, SequenceI to) { String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRefs(), DBRefSource.CODINGDBS); if (cdsRefs != null) { for (DBRefEntry cdsRef : cdsRefs) { to.addDBRef(new DBRefEntry(cdsRef)); cdsAccId = cdsRef.getAccessionId(); } } if (!to.getName().contains(cdsAccId)) { to.setName(to.getName() + "|" + cdsAccId); } } }