X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=7d3fec2ee670fb5501c181c9a91ec84e228ab121;hb=e309b8d9bd62fb304fdd612c7385e76027d2f2d7;hp=b55d58d08bf15982cd3c99fcc458c643c9455b9b;hpb=ad15cff29620f960119f80176f1fd443da9f6763;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index b55d58d..7d3fec2 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -20,12 +20,27 @@ */ package jalview.analysis; +import jalview.datamodel.AlignedCodon; +import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Mapping; +import jalview.datamodel.SearchResults; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; +import jalview.schemes.ResidueProperties; +import jalview.util.MapList; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; /** * grab bag of useful alignment manipulation operations Expect these to be @@ -38,6 +53,15 @@ public class AlignmentUtils { /** + * Represents the 3 possible results of trying to map one alignment to + * another. + */ + public enum MappingResult + { + Mapped, NotMapped, AlreadyMapped + } + + /** * given an existing alignment, create a new alignment including all, or up to * flankSize additional symbols from each sequence's dataset sequence * @@ -159,4 +183,723 @@ public class AlignmentUtils } return result; } + + /** + * Returns a map of lists of sequences in the alignment, keyed by sequence + * name. For use in mapping between different alignment views of the same + * sequences. + * + * @see jalview.datamodel.AlignmentI#getSequencesByName() + */ + public static Map> getSequencesByName( + AlignmentI al) + { + Map> theMap = new LinkedHashMap>(); + for (SequenceI seq : al.getSequences()) + { + String name = seq.getName(); + if (name != null) + { + List seqs = theMap.get(name); + if (seqs == null) + { + seqs = new ArrayList(); + theMap.put(name, seqs); + } + seqs.add(seq); + } + } + return theMap; + } + + /** + * Build mapping of protein to cDNA alignment. Mappings are made between + * sequences which have the same name and compatible lengths. Any new mappings + * are added to the protein alignment. Has a 3-valued result: either Mapped + * (at least one sequence mapping was created), AlreadyMapped (all possible + * sequence mappings already exist), or NotMapped (no possible sequence + * mappings exist). + * + * @param proteinAlignment + * @param cdnaAlignment + * @return + */ + public static MappingResult mapProteinToCdna( + final AlignmentI proteinAlignment, + final AlignmentI cdnaAlignment) + { + if (proteinAlignment == null || cdnaAlignment == null) + { + return MappingResult.NotMapped; + } + + boolean mappingPossible = false; + boolean mappingPerformed = false; + + List thisSeqs = proteinAlignment.getSequences(); + + /* + * Build a look-up of cDNA sequences by name, for matching purposes. + */ + Map> cdnaSeqs = cdnaAlignment + .getSequencesByName(); + + for (SequenceI aaSeq : thisSeqs) + { + AlignedCodonFrame acf = new AlignedCodonFrame(); + List candidates = cdnaSeqs.get(aaSeq.getName()); + if (candidates == null) + { + /* + * No cDNA sequence with matching name, so no mapping possible for this + * protein sequence + */ + continue; + } + mappingPossible = true; + for (SequenceI cdnaSeq : candidates) + { + if (!mappingExists(proteinAlignment.getCodonFrames(), + aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence())) + { + MapList map = mapProteinToCdna(aaSeq, cdnaSeq); + if (map != null) + { + acf.addMap(cdnaSeq, aaSeq, map); + mappingPerformed = true; + } + } + } + proteinAlignment.addCodonFrame(acf); + } + + /* + * If at least one mapping was possible but none was done, then the + * alignments are already as mapped as they can be. + */ + if (mappingPossible && !mappingPerformed) + { + return MappingResult.AlreadyMapped; + } + else + { + return mappingPerformed ? MappingResult.Mapped + : MappingResult.NotMapped; + } + } + + /** + * Answers true if the mappings include one between the given (dataset) + * sequences. + */ + public static boolean mappingExists(Set set, + SequenceI aaSeq, SequenceI cdnaSeq) + { + if (set != null) + { + for (AlignedCodonFrame acf : set) + { + if (cdnaSeq == acf.getDnaForAaSeq(aaSeq)) + { + return true; + } + } + } + return false; + } + + /** + * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA + * must be three times the length of the protein, possibly after ignoring + * start and/or stop codons. Returns null if no mapping is determined. + * + * @param proteinSeqs + * @param cdnaSeq + * @return + */ + public static MapList mapProteinToCdna(SequenceI proteinSeq, + SequenceI cdnaSeq) + { + /* + * Here we handle either dataset sequence set (desktop) or absent (applet) + */ + final SequenceI proteinDataset = proteinSeq.getDatasetSequence(); + String aaSeqString = proteinDataset != null ? proteinDataset + .getSequenceAsString() : proteinSeq.getSequenceAsString(); + final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence(); + String cdnaSeqString = cdnaDataset != null ? cdnaDataset + .getSequenceAsString() : cdnaSeq.getSequenceAsString(); + if (aaSeqString == null || cdnaSeqString == null) + { + return null; + } + + final int mappedLength = 3 * aaSeqString.length(); + int cdnaLength = cdnaSeqString.length(); + int cdnaStart = 1; + int cdnaEnd = cdnaLength; + final int proteinStart = 1; + final int proteinEnd = aaSeqString.length(); + + /* + * If lengths don't match, try ignoring stop codon. + */ + if (cdnaLength != mappedLength) + { + for (Object stop : ResidueProperties.STOP) + { + if (cdnaSeqString.toUpperCase().endsWith((String) stop)) + { + cdnaEnd -= 3; + cdnaLength -= 3; + break; + } + } + } + + /* + * If lengths still don't match, try ignoring start codon. + */ + if (cdnaLength != mappedLength + && cdnaSeqString.toUpperCase().startsWith( + ResidueProperties.START)) + { + cdnaStart += 3; + cdnaLength -= 3; + } + + if (cdnaLength == mappedLength) + { + MapList map = new MapList(new int[] + { cdnaStart, cdnaEnd }, new int[] + { proteinStart, proteinEnd }, 3, 1); + return map; + } + else + { + return null; + } + } + + /** + * Align sequence 'seq' to match the alignment of a mapped sequence. Note this + * currently assumes that we are aligning cDNA to match protein. + * + * @param seq + * the sequence to be realigned + * @param al + * the alignment whose sequence alignment is to be 'copied' + * @param gap + * character string represent a gap in the realigned sequence + * @param preserveUnmappedGaps + * @param preserveMappedGaps + * @return true if the sequence was realigned, false if it could not be + */ + public static boolean alignSequenceAs(SequenceI seq, AlignmentI al, + String gap, boolean preserveMappedGaps, + boolean preserveUnmappedGaps) + { + /* + * Get any mappings from the source alignment to the target (dataset) sequence. + */ + // TODO there may be one AlignedCodonFrame per dataset sequence, or one with + // all mappings. Would it help to constrain this? + List mappings = al.getCodonFrame(seq); + if (mappings == null || mappings.isEmpty()) + { + return false; + } + + /* + * Locate the aligned source sequence whose dataset sequence is mapped. We + * just take the first match here (as we can't align cDNA like more than one + * protein sequence). + */ + SequenceI alignFrom = null; + AlignedCodonFrame mapping = null; + for (AlignedCodonFrame mp : mappings) + { + alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al); + if (alignFrom != null) + { + mapping = mp; + break; + } + } + + if (alignFrom == null) + { + return false; + } + alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(), + preserveMappedGaps, preserveUnmappedGaps); + return true; + } + + /** + * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to + * match residues and codons. Flags control whether existing gaps in unmapped + * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro + * and exon are only retained if both flags are set. + * + * @param alignTo + * @param alignFrom + * @param mapping + * @param myGap + * @param sourceGap + * @param preserveUnmappedGaps + * @param preserveMappedGaps + */ + public static void alignSequenceAs(SequenceI alignTo, + SequenceI alignFrom, + AlignedCodonFrame mapping, String myGap, char sourceGap, + boolean preserveMappedGaps, boolean preserveUnmappedGaps) + { + // TODO generalise to work for Protein-Protein, dna-dna, dna-protein + final char[] thisSeq = alignTo.getSequence(); + final char[] thatAligned = alignFrom.getSequence(); + StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); + + // aligned and dataset sequence positions, all base zero + int thisSeqPos = 0; + int sourceDsPos = 0; + + int basesWritten = 0; + char myGapChar = myGap.charAt(0); + int ratio = myGap.length(); + + /* + * Traverse the aligned protein sequence. + */ + int sourceGapMappedLength = 0; + boolean inExon = false; + for (char sourceChar : thatAligned) + { + if (sourceChar == sourceGap) + { + sourceGapMappedLength += ratio; + continue; + } + + /* + * Found a residue. Locate its mapped codon (start) position. + */ + sourceDsPos++; + // Note mapping positions are base 1, our sequence positions base 0 + int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom, + sourceDsPos); + if (mappedPos == null) + { + /* + * Abort realignment if unmapped protein. Or could ignore it?? + */ + System.err.println("Can't align: no codon mapping to residue " + + sourceDsPos + "(" + sourceChar + ")"); + return; + } + + int mappedCodonStart = mappedPos[0]; // position (1...) of codon start + int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos + StringBuilder trailingCopiedGap = new StringBuilder(); + + /* + * Copy dna sequence up to and including this codon. Optionally, include + * gaps before the codon starts (in introns) and/or after the codon starts + * (in exons). + * + * Note this only works for 'linear' splicing, not reverse or interleaved. + * But then 'align dna as protein' doesn't make much sense otherwise. + */ + int intronLength = 0; + while (basesWritten < mappedCodonEnd && thisSeqPos < thisSeq.length) + { + final char c = thisSeq[thisSeqPos++]; + if (c != myGapChar) + { + basesWritten++; + + if (basesWritten < mappedCodonStart) + { + /* + * Found an unmapped (intron) base. First add in any preceding gaps + * (if wanted). + */ + if (preserveUnmappedGaps && trailingCopiedGap.length() > 0) + { + thisAligned.append(trailingCopiedGap.toString()); + intronLength += trailingCopiedGap.length(); + trailingCopiedGap = new StringBuilder(); + } + intronLength++; + inExon = false; + } + else + { + final boolean startOfCodon = basesWritten == mappedCodonStart; + int gapsToAdd = calculateGapsToInsert(preserveMappedGaps, + preserveUnmappedGaps, sourceGapMappedLength, inExon, + trailingCopiedGap.length(), intronLength, startOfCodon); + for (int i = 0; i < gapsToAdd; i++) + { + thisAligned.append(myGapChar); + } + sourceGapMappedLength = 0; + inExon = true; + } + thisAligned.append(c); + trailingCopiedGap = new StringBuilder(); + } + else + { + if (inExon && preserveMappedGaps) + { + trailingCopiedGap.append(myGapChar); + } + else if (!inExon && preserveUnmappedGaps) + { + trailingCopiedGap.append(myGapChar); + } + } + } + } + + /* + * At end of protein sequence. Copy any remaining dna sequence, optionally + * including (intron) gaps. We do not copy trailing gaps in protein. + */ + while (thisSeqPos < thisSeq.length) + { + final char c = thisSeq[thisSeqPos++]; + if (c != myGapChar || preserveUnmappedGaps) + { + thisAligned.append(c); + } + } + + /* + * All done aligning, set the aligned sequence. + */ + alignTo.setSequence(new String(thisAligned)); + } + + /** + * Helper method to work out how many gaps to insert when realigning. + * + * @param preserveMappedGaps + * @param preserveUnmappedGaps + * @param sourceGapMappedLength + * @param inExon + * @param trailingCopiedGap + * @param intronLength + * @param startOfCodon + * @return + */ + protected static int calculateGapsToInsert(boolean preserveMappedGaps, + boolean preserveUnmappedGaps, int sourceGapMappedLength, + boolean inExon, int trailingGapLength, + int intronLength, final boolean startOfCodon) + { + int gapsToAdd = 0; + if (startOfCodon) + { + /* + * Reached start of codon. Ignore trailing gaps in intron unless we are + * preserving gaps in both exon and intron. Ignore them anyway if the + * protein alignment introduces a gap at least as large as the intronic + * region. + */ + if (inExon && !preserveMappedGaps) + { + trailingGapLength = 0; + } + if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps)) + { + trailingGapLength = 0; + } + if (inExon) + { + gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); + } + else + { + if (intronLength + trailingGapLength <= sourceGapMappedLength) + { + gapsToAdd = sourceGapMappedLength - intronLength; + } + else + { + gapsToAdd = Math.min(intronLength + trailingGapLength + - sourceGapMappedLength, trailingGapLength); + } + } + } + else + { + /* + * second or third base of codon; check for any gaps in dna + */ + if (!preserveMappedGaps) + { + trailingGapLength = 0; + } + gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); + } + return gapsToAdd; + } + + /** + * Returns a list of sequences mapped from the given sequences and aligned + * (gapped) in the same way. For example, the cDNA for aligned protein, where + * a single gap in protein generates three gaps in cDNA. + * + * @param sequences + * @param gapCharacter + * @param mappings + * @return + */ + public static List getAlignedTranslation( + List sequences, char gapCharacter, + Set mappings) + { + List alignedSeqs = new ArrayList(); + + for (SequenceI seq : sequences) + { + List mapped = getAlignedTranslation(seq, gapCharacter, + mappings); + alignedSeqs.addAll(mapped); + } + return alignedSeqs; + } + + /** + * Returns sequences aligned 'like' the source sequence, as mapped by the + * given mappings. Normally we expect zero or one 'mapped' sequences, but this + * will support 1-to-many as well. + * + * @param seq + * @param gapCharacter + * @param mappings + * @return + */ + protected static List getAlignedTranslation(SequenceI seq, + char gapCharacter, Set mappings) + { + List result = new ArrayList(); + for (AlignedCodonFrame mapping : mappings) + { + if (mapping.involvesSequence(seq)) + { + SequenceI mapped = getAlignedTranslation(seq, gapCharacter, mapping); + if (mapped != null) + { + result.add(mapped); + } + } + } + return result; + } + + /** + * Returns the translation of 'seq' (as held in the mapping) with + * corresponding alignment (gaps). + * + * @param seq + * @param gapCharacter + * @param mapping + * @return + */ + protected static SequenceI getAlignedTranslation(SequenceI seq, + char gapCharacter, AlignedCodonFrame mapping) + { + String gap = String.valueOf(gapCharacter); + boolean toDna = false; + int fromRatio = 1; + SequenceI mapTo = mapping.getDnaForAaSeq(seq); + if (mapTo != null) + { + // mapping is from protein to nucleotide + toDna = true; + // should ideally get gap count ratio from mapping + gap = String.valueOf(new char[] + { gapCharacter, gapCharacter, gapCharacter }); + } + else + { + // mapping is from nucleotide to protein + mapTo = mapping.getAaForDnaSeq(seq); + fromRatio = 3; + } + StringBuilder newseq = new StringBuilder(seq.getLength() + * (toDna ? 3 : 1)); + + int residueNo = 0; // in seq, base 1 + int[] phrase = new int[fromRatio]; + int phraseOffset = 0; + int gapWidth = 0; + boolean first = true; + final Sequence alignedSeq = new Sequence("", ""); + + for (char c : seq.getSequence()) + { + if (c == gapCharacter) + { + gapWidth++; + if (gapWidth >= fromRatio) + { + newseq.append(gap); + gapWidth = 0; + } + } + else + { + phrase[phraseOffset++] = residueNo + 1; + if (phraseOffset == fromRatio) + { + /* + * Have read a whole codon (or protein residue), now translate: map + * source phrase to positions in target sequence add characters at + * these positions to newseq Note mapping positions are base 1, our + * sequence positions base 0. + */ + SearchResults sr = new SearchResults(); + for (int pos : phrase) + { + mapping.markMappedRegion(seq, pos, sr); + } + newseq.append(sr.toString()); + if (first) + { + first = false; + // Hack: Copy sequence dataset, name and description from + // SearchResults.match[0].sequence + // TODO? carry over sequence names from original 'complement' + // alignment + SequenceI mappedTo = sr.getResultSequence(0); + alignedSeq.setName(mappedTo.getName()); + alignedSeq.setDescription(mappedTo.getDescription()); + alignedSeq.setDatasetSequence(mappedTo); + } + phraseOffset = 0; + } + residueNo++; + } + } + alignedSeq.setSequence(newseq.toString()); + return alignedSeq; + } + + /** + * Realigns the given protein to match the alignment of the dna, using codon + * mappings to translate aligned codon positions to protein residues. + * + * @param protein + * the alignment whose sequences are realigned by this method + * @param dna + * the dna alignment whose alignment we are 'copying' + * @return the number of sequences that were realigned + */ + public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) + { + Set mappings = protein.getCodonFrames(); + + /* + * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of + * {dnaSequence, {proteinSequence, codonProduct}} at that position. The + * comparator keeps the codon positions ordered. + */ + Map> alignedCodons = new TreeMap>( + new CodonComparator()); + for (SequenceI dnaSeq : dna.getSequences()) + { + for (AlignedCodonFrame mapping : mappings) + { + Mapping seqMap = mapping.getMappingForSequence(dnaSeq); + SequenceI prot = mapping.findAlignedSequence( + dnaSeq.getDatasetSequence(), protein); + if (prot != null) + { + addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), + seqMap, alignedCodons); + } + } + } + return alignProteinAs(protein, alignedCodons); + } + + /** + * Update the aligned protein sequences to match the codon alignments given in + * the map. + * + * @param protein + * @param alignedCodons + * an ordered map of codon positions (columns), with sequence/peptide + * values present in each column + * @return + */ + protected static int alignProteinAs(AlignmentI protein, + Map> alignedCodons) + { + /* + * Prefill aligned sequences with gaps before inserting aligned protein + * residues. + */ + int alignedWidth = alignedCodons.size(); + char[] gaps = new char[alignedWidth]; + Arrays.fill(gaps, protein.getGapCharacter()); + String allGaps = String.valueOf(gaps); + for (SequenceI seq : protein.getSequences()) + { + seq.setSequence(allGaps); + } + + int column = 0; + for (AlignedCodon codon : alignedCodons.keySet()) + { + final Map columnResidues = alignedCodons.get(codon); + for (Entry entry : columnResidues + .entrySet()) + { + // place translated codon at its column position in sequence + entry.getKey().getSequence()[column] = entry.getValue().charAt(0); + } + column++; + } + return 0; + } + + /** + * Populate the map of aligned codons by traversing the given sequence + * mapping, locating the aligned positions of mapped codons, and adding those + * positions and their translation products to the map. + * + * @param dna + * the aligned sequence we are mapping from + * @param protein + * the sequence to be aligned to the codons + * @param gapChar + * the gap character in the dna sequence + * @param seqMap + * a mapping to a sequence translation + * @param alignedCodons + * the map we are building up + */ + static void addCodonPositions(SequenceI dna, SequenceI protein, + char gapChar, + Mapping seqMap, + Map> alignedCodons) + { + Iterator codons = seqMap.getCodonIterator(dna, gapChar); + while (codons.hasNext()) + { + AlignedCodon codon = codons.next(); + Map seqProduct = alignedCodons.get(codon); + if (seqProduct == null) + { + seqProduct = new HashMap(); + alignedCodons.put(codon, seqProduct); + } + seqProduct.put(protein, codon.product); + } + } }