X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=2dbe015954d472a86ac8b8a4c30ecd818068e68a;hb=61f1a8b75ea5ce352d6214c34fbdcd58bafbbb73;hp=85699a4fa29144076e6d027f0ae7a18233f32f4c;hpb=b55d5f8c5ce50ee068c8345c1bdb3dbc711a74df;p=jalview.git
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java
index 85699a4..2dbe015 100644
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -1,22 +1,60 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
+ * Copyright (C) 2014 The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see .
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
package jalview.analysis;
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentAnnotation;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.SequenceI;
+import jalview.schemes.ResidueProperties;
+import jalview.util.MapList;
+
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
-
-import jalview.datamodel.SequenceI;
-import jalview.datamodel.AlignmentI;
+import java.util.Map;
/**
- * grab bag of useful alignment manipulation operations
- * Expect these to be refactored elsewhere at some point.
+ * grab bag of useful alignment manipulation operations Expect these to be
+ * refactored elsewhere at some point.
+ *
* @author jimp
- *
+ *
*/
public class AlignmentUtils
{
/**
- * given an existing alignment, create a new alignment including all, or up to flankSize additional symbols from each sequence's dataset sequence
+ * Represents the 3 possible results of trying to map one alignment to
+ * another.
+ */
+ public enum MappingResult
+ {
+ Mapped, NotMapped, AlreadyMapped
+ }
+
+ /**
+ * given an existing alignment, create a new alignment including all, or up to
+ * flankSize additional symbols from each sequence's dataset sequence
+ *
* @param core
* @param flankSize
* @return AlignmentI
@@ -25,68 +63,489 @@ public class AlignmentUtils
{
List sq = new ArrayList();
int maxoffset = 0;
- for (SequenceI s:core.getSequences())
+ for (SequenceI s : core.getSequences())
{
SequenceI newSeq = s.deriveSequence();
- if (newSeq.getStart()>maxoffset && newSeq.getDatasetSequence().getStart() maxoffset
+ && newSeq.getDatasetSequence().getStart() < s.getStart())
{
maxoffset = newSeq.getStart();
}
sq.add(newSeq);
}
- if (flankSize>-1) {
+ if (flankSize > -1)
+ {
maxoffset = flankSize;
}
// now add offset to create a new expanded alignment
- for (SequenceI s:sq)
+ for (SequenceI s : sq)
{
SequenceI ds = s;
- while (ds.getDatasetSequence()!=null) {
- ds=ds.getDatasetSequence();
+ while (ds.getDatasetSequence() != null)
+ {
+ ds = ds.getDatasetSequence();
}
- int s_end = s.findPosition(s.getStart()+s.getLength());
+ int s_end = s.findPosition(s.getStart() + s.getLength());
// find available flanking residues for sequence
- int ustream_ds=s.getStart()-ds.getStart(),dstream_ds=ds.getEnd()-s_end;
-
+ int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds
+ .getEnd() - s_end;
+
// build new flanked sequence
-
+
// compute gap padding to start of flanking sequence
- int offset=maxoffset - ustream_ds;
-
+ int offset = maxoffset - ustream_ds;
+
// padding is gapChar x ( maxoffset - min(ustream_ds, flank)
- if (flankSize>=0) {
- if (flankSize= 0)
+ {
+ if (flankSize < ustream_ds)
{
- // take up to flankSize residues
- offset = maxoffset-flankSize;
- ustream_ds = flankSize;
- }
- if (flankSize> getSequencesByName(
+ AlignmentI al)
+ {
+ Map> theMap = new LinkedHashMap>();
+ for (SequenceI seq : al.getSequences())
+ {
+ String name = seq.getName();
+ if (name != null)
+ {
+ List seqs = theMap.get(name);
+ if (seqs == null)
+ {
+ seqs = new ArrayList();
+ theMap.put(name, seqs);
+ }
+ seqs.add(seq);
+ }
+ }
+ return theMap;
+ }
+
+ /**
+ * Build mapping of protein to cDNA alignment. Mappings are made between
+ * sequences which have the same name and compatible lengths. Has a 3-valued
+ * result: either Mapped (at least one sequence mapping was created),
+ * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
+ * (no possible sequence mappings exist).
+ *
+ * @param proteinAlignment
+ * @param cdnaAlignment
+ * @return
+ */
+ public static MappingResult mapProteinToCdna(
+ final AlignmentI proteinAlignment,
+ final AlignmentI cdnaAlignment)
+ {
+ boolean mappingPossible = false;
+ boolean mappingPerformed = false;
+
+ List thisSeqs = proteinAlignment.getSequences();
+
+ /*
+ * Build a look-up of cDNA sequences by name, for matching purposes.
+ */
+ Map> cdnaSeqs = cdnaAlignment
+ .getSequencesByName();
+
+ for (SequenceI aaSeq : thisSeqs)
+ {
+ AlignedCodonFrame acf = new AlignedCodonFrame();
+ List candidates = cdnaSeqs.get(aaSeq.getName());
+ if (candidates == null)
+ {
+ /*
+ * No cDNA sequence with matching name, so no mapping possible for this
+ * protein sequence
+ */
+ continue;
+ }
+ mappingPossible = true;
+ for (SequenceI cdnaSeq : candidates)
+ {
+ if (!mappingExists(proteinAlignment.getCodonFrames(),
+ aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
+ {
+ MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
+ if (map != null)
+ {
+ acf.addMap(cdnaSeq, aaSeq, map);
+ mappingPerformed = true;
+ }
+ }
+ }
+ proteinAlignment.addCodonFrame(acf);
+ }
+
+ /*
+ * If at least one mapping was possible but none was done, then the
+ * alignments are already as mapped as they can be.
+ */
+ if (mappingPossible && !mappingPerformed)
+ {
+ return MappingResult.AlreadyMapped;
+ }
+ else
+ {
+ return mappingPerformed ? MappingResult.Mapped
+ : MappingResult.NotMapped;
+ }
+ }
+
+ /**
+ * Answers true if the mappings include one between the given (dataset)
+ * sequences.
+ */
+ public static boolean mappingExists(AlignedCodonFrame[] codonFrames,
+ SequenceI aaSeq, SequenceI cdnaSeq)
+ {
+ if (codonFrames != null)
+ {
+ for (AlignedCodonFrame acf : codonFrames)
+ {
+ if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
+ {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
+ * must be three times the length of the protein, possibly after ignoring
+ * start and/or stop codons. Returns null if no mapping is determined.
+ *
+ * @param proteinSeqs
+ * @param cdnaSeq
+ * @return
+ */
+ public static MapList mapProteinToCdna(SequenceI proteinSeq,
+ SequenceI cdnaSeq)
+ {
+ String aaSeqString = proteinSeq.getDatasetSequence()
+ .getSequenceAsString();
+ String cdnaSeqString = cdnaSeq.getDatasetSequence()
+ .getSequenceAsString();
+ if (aaSeqString == null || cdnaSeqString == null)
+ {
+ return null;
+ }
+
+ final int mappedLength = 3 * aaSeqString.length();
+ int cdnaLength = cdnaSeqString.length();
+ int cdnaStart = 1;
+ int cdnaEnd = cdnaLength;
+ final int proteinStart = 1;
+ final int proteinEnd = aaSeqString.length();
+
+ /*
+ * If lengths don't match, try ignoring stop codon.
+ */
+ if (cdnaLength != mappedLength)
+ {
+ for (Object stop : ResidueProperties.STOP)
+ {
+ if (cdnaSeqString.toUpperCase().endsWith((String) stop))
+ {
+ cdnaEnd -= 3;
+ cdnaLength -= 3;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If lengths still don't match, try ignoring start codon.
+ */
+ if (cdnaLength != mappedLength
+ && cdnaSeqString.toUpperCase().startsWith(
+ ResidueProperties.START))
+ {
+ cdnaStart += 3;
+ cdnaLength -= 3;
+ }
+
+ if (cdnaLength == mappedLength)
+ {
+ MapList map = new MapList(new int[]
+ { cdnaStart, cdnaEnd }, new int[]
+ { proteinStart, proteinEnd }, 3, 1);
+ return map;
+ }
+ else
+ {
+ return null;
+ }
+ }
+
+ /**
+ * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
+ * currently assumes that we are aligning cDNA to match protein.
+ *
+ * @param seq
+ * the sequence to be realigned
+ * @param al
+ * the alignment whose sequence alignment is to be 'copied'
+ * @param gap
+ * character string represent a gap in the realigned sequence
+ * @param preserveUnmappedGaps
+ * @param preserveMappedGaps
+ * @return true if the sequence was realigned, false if it could not be
+ */
+ public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
+ String gap, boolean preserveMappedGaps,
+ boolean preserveUnmappedGaps)
+ {
+ /*
+ * Get any mappings from the source alignment to the target (dataset) sequence.
+ */
+ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
+ // all mappings. Would it help to constrain this?
+ AlignedCodonFrame[] mappings = al.getCodonFrame(seq);
+ if (mappings == null)
+ {
+ return false;
+ }
+
+ /*
+ * Locate the aligned source sequence whose dataset sequence is mapped. We
+ * just take the first match here (as we can't align cDNA like more than one
+ * protein sequence).
+ */
+ SequenceI alignFrom = null;
+ AlignedCodonFrame mapping = null;
+ for (AlignedCodonFrame mp : mappings)
+ {
+ alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al);
+ if (alignFrom != null)
+ {
+ mapping = mp;
+ break;
+ }
+ }
+
+ if (alignFrom == null)
+ {
+ return false;
+ }
+ alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
+ preserveMappedGaps, preserveUnmappedGaps);
+ return true;
+ }
+
+ /**
+ * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
+ * match residues and codons.
+ *
+ * @param alignTo
+ * @param alignFrom
+ * @param mapping
+ * @param myGap
+ * @param sourceGap
+ * @param preserveUnmappedGaps
+ * @param preserveMappedGaps
+ */
+ public static void alignSequenceAs(SequenceI alignTo,
+ SequenceI alignFrom,
+ AlignedCodonFrame mapping, String myGap, char sourceGap,
+ boolean preserveMappedGaps, boolean preserveUnmappedGaps)
+ {
+ // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
+ final char[] thisSeq = alignTo.getSequence();
+ final char[] thatAligned = alignFrom.getSequence();
+ StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+
+ // aligned and dataset sequence positions, all base zero
+ int thisSeqPos = 0;
+ int sourceDsPos = 0;
+
+ int basesWritten = 0;
+ char myGapChar = myGap.charAt(0);
+ int ratio = myGap.length();
+
+ /*
+ * Traverse the aligned protein sequence.
+ */
+ int sourceGapLength = 0;
+ for (char sourceChar : thatAligned)
+ {
+ if (sourceChar == sourceGap)
+ {
+ sourceGapLength++;
+ continue;
+ }
+
+ /*
+ * Found a residue. Locate its mapped codon (start) position.
+ */
+ sourceDsPos++;
+ // Note mapping positions are base 1, our sequence positions base 0
+ int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
+ sourceDsPos);
+ if (mappedPos == null)
+ {
+ /*
+ * Abort realignment if unmapped protein. Or could ignore it??
+ */
+ System.err.println("Can't align: no codon mapping to residue "
+ + sourceDsPos + "(" + sourceChar + ")");
+ return;
+ }
+
+ int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
+ int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
+ int trailingCopiedGapLength = 0;
+
+ /*
+ * Copy dna sequence up to and including this codon. Optionally, include
+ * gaps before the codon starts (in introns) and/or after the codon starts
+ * (in exons).
+ *
+ * Note this only works for 'linear' splicing, not reverse or interleaved.
+ * But then 'align dna as protein' doesn't make much sense otherwise.
+ */
+ boolean inCodon = false;
+ while (basesWritten < mappedCodonEnd && thisSeqPos < thisSeq.length)
+ {
+ final char c = thisSeq[thisSeqPos++];
+ if (c != myGapChar)
+ {
+ basesWritten++;
+
+ /*
+ * Is this the start of the mapped codon? If so, add in any extra gap
+ * due to the protein alignment.
+ */
+ if (basesWritten == mappedCodonStart)
+ {
+ inCodon = true;
+ int gapsToAdd = Math.max(0, ratio * sourceGapLength
+ - trailingCopiedGapLength);
+ for (int i = 0; i < gapsToAdd; i++)
+ {
+ thisAligned.append(myGapChar);
+ }
+ sourceGapLength = 0;
+ }
+ thisAligned.append(c);
+ trailingCopiedGapLength = 0;
+ }
+ else if ((!inCodon && preserveUnmappedGaps)
+ || (inCodon && preserveMappedGaps))
+ {
+ thisAligned.append(c);
+ trailingCopiedGapLength++;
+ }
+ }
+
+ /*
+ * Expand (if necessary) the trailing gap to the size of the aligned gap.
+ */
+ int gapsToAdd = (ratio * sourceGapLength - trailingCopiedGapLength);
+ for (int i = 0; i < gapsToAdd; i++)
+ {
+ thisAligned.append(myGapChar);
+ }
+ }
+
+ /*
+ * At end of protein sequence. Copy any remaining dna sequence, optionally
+ * including (intron) gaps. We do not copy trailing gaps in protein.
+ */
+ while (thisSeqPos < thisSeq.length)
+ {
+ final char c = thisSeq[thisSeqPos++];
+ if (c != myGapChar || preserveUnmappedGaps)
+ {
+ thisAligned.append(c);
+ }
+ }
+
+ /*
+ * All done aligning, set the aligned sequence.
+ */
+ alignTo.setSequence(new String(thisAligned));
+ }
}