{
/**
+ * Represents the 3 possible results of trying to map one alignment to
+ * another.
+ */
+ public enum MappingResult
+ {
+ Mapped, NotMapped, AlreadyMapped
+ }
+
+ /**
* given an existing alignment, create a new alignment including all, or up to
* flankSize additional symbols from each sequence's dataset sequence
*
/**
* Build mapping of protein to cDNA alignment. Mappings are made between
- * sequences which have the same name and compatible lengths. Returns true if
- * at least one sequence mapping was made, else false.
+ * sequences which have the same name and compatible lengths. Has a 3-valued
+ * result: either Mapped (at least one sequence mapping was created),
+ * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
+ * (no possible sequence mappings exist).
*
* @param proteinAlignment
* @param cdnaAlignment
* @return
*/
- public static boolean mapProteinToCdna(final AlignmentI proteinAlignment,
+ public static MappingResult mapProteinToCdna(
+ final AlignmentI proteinAlignment,
final AlignmentI cdnaAlignment)
{
- boolean mapped = false;
+ boolean mappingPossible = false;
+ boolean mappingPerformed = false;
+
List<SequenceI> thisSeqs = proteinAlignment.getSequences();
/*
for (SequenceI aaSeq : thisSeqs)
{
- AlignedCodonFrame acf = new AlignedCodonFrame(
- proteinAlignment.getWidth());
+ AlignedCodonFrame acf = new AlignedCodonFrame();
List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
if (candidates == null)
{
/*
- * No cDNA sequence with matching name, so no mapping for this protein
- * sequence
+ * No cDNA sequence with matching name, so no mapping possible for this
+ * protein sequence
*/
continue;
}
+ mappingPossible = true;
for (SequenceI cdnaSeq : candidates)
{
- MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
- if (map != null)
+ if (!mappingExists(proteinAlignment.getCodonFrames(),
+ aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
{
- acf.addMap(cdnaSeq, aaSeq, map);
- mapped = true;
+ MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
+ if (map != null)
+ {
+ acf.addMap(cdnaSeq, aaSeq, map);
+ mappingPerformed = true;
+ }
}
}
proteinAlignment.addCodonFrame(acf);
}
- return mapped;
+
+ /*
+ * If at least one mapping was possible but none was done, then the
+ * alignments are already as mapped as they can be.
+ */
+ if (mappingPossible && !mappingPerformed)
+ {
+ return MappingResult.AlreadyMapped;
+ }
+ else
+ {
+ return mappingPerformed ? MappingResult.Mapped
+ : MappingResult.NotMapped;
+ }
+ }
+
+ /**
+ * Answers true if the mappings include one between the given (dataset)
+ * sequences.
+ */
+ public static boolean mappingExists(AlignedCodonFrame[] codonFrames,
+ SequenceI aaSeq, SequenceI cdnaSeq)
+ {
+ if (codonFrames != null)
+ {
+ for (AlignedCodonFrame acf : codonFrames)
+ {
+ if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
+ {
+ return true;
+ }
+ }
+ }
+ return false;
}
/**
return null;
}
}
+
+ /**
+ * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
+ * currently assumes that we are aligning cDNA to match protein.
+ *
+ * @param seq
+ * the sequence to be realigned
+ * @param al
+ * the alignment whose sequence alignment is to be 'copied'
+ * @param gap
+ * character string represent a gap in the realigned sequence
+ * @param preserveUnmappedGaps
+ * @param preserveMappedGaps
+ * @return true if the sequence was realigned, false if it could not be
+ */
+ public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
+ String gap, boolean preserveMappedGaps,
+ boolean preserveUnmappedGaps)
+ {
+ /*
+ * Get any mappings from the source alignment to the target (dataset) sequence.
+ */
+ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
+ // all mappings. Would it help to constrain this?
+ AlignedCodonFrame[] mappings = al.getCodonFrame(seq);
+ if (mappings == null)
+ {
+ return false;
+ }
+
+ /*
+ * Locate the aligned source sequence whose dataset sequence is mapped. We
+ * just take the first match here (as we can't align cDNA like more than one
+ * protein sequence).
+ */
+ SequenceI alignFrom = null;
+ AlignedCodonFrame mapping = null;
+ for (AlignedCodonFrame mp : mappings)
+ {
+ alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al);
+ if (alignFrom != null)
+ {
+ mapping = mp;
+ break;
+ }
+ }
+
+ if (alignFrom == null)
+ {
+ return false;
+ }
+ alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
+ preserveMappedGaps, preserveUnmappedGaps);
+ return true;
+ }
+
+ /**
+ * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
+ * match residues and codons.
+ *
+ * @param alignTo
+ * @param alignFrom
+ * @param mapping
+ * @param myGap
+ * @param sourceGap
+ * @param preserveUnmappedGaps
+ * @param preserveMappedGaps
+ */
+ public static void alignSequenceAs(SequenceI alignTo,
+ SequenceI alignFrom,
+ AlignedCodonFrame mapping, String myGap, char sourceGap,
+ boolean preserveMappedGaps, boolean preserveUnmappedGaps)
+ {
+ // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
+ final char[] thisSeq = alignTo.getSequence();
+ final char[] thatAligned = alignFrom.getSequence();
+ StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+
+ // aligned and dataset sequence positions, all base zero
+ int thisSeqPos = 0;
+ int sourceDsPos = 0;
+
+ int basesWritten = 0;
+ char myGapChar = myGap.charAt(0);
+ int ratio = myGap.length();
+
+ /*
+ * Traverse the aligned protein sequence.
+ */
+ int sourceGapLength = 0;
+ for (char sourceChar : thatAligned)
+ {
+ if (sourceChar == sourceGap)
+ {
+ sourceGapLength++;
+ continue;
+ }
+
+ /*
+ * Found a residue. Locate its mapped codon (start) position.
+ */
+ sourceDsPos++;
+ // Note mapping positions are base 1, our sequence positions base 0
+ int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
+ sourceDsPos);
+ if (mappedPos == null)
+ {
+ /*
+ * Abort realignment if unmapped protein. Or could ignore it??
+ */
+ System.err.println("Can't align: no codon mapping to residue "
+ + sourceDsPos + "(" + sourceChar + ")");
+ return;
+ }
+
+ int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
+ int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
+ int trailingCopiedGapLength = 0;
+
+ /*
+ * Copy dna sequence up to and including this codon. Optionally, include
+ * gaps before the codon starts (in introns) and/or after the codon starts
+ * (in exons).
+ *
+ * Note this only works for 'linear' splicing, not reverse or interleaved.
+ * But then 'align dna as protein' doesn't make much sense otherwise.
+ */
+ boolean inCodon = false;
+ while (basesWritten < mappedCodonEnd && thisSeqPos < thisSeq.length)
+ {
+ final char c = thisSeq[thisSeqPos++];
+ if (c != myGapChar)
+ {
+ basesWritten++;
+
+ /*
+ * Is this the start of the mapped codon? If so, add in any extra gap
+ * due to the protein alignment.
+ */
+ if (basesWritten == mappedCodonStart)
+ {
+ inCodon = true;
+ int gapsToAdd = Math.max(0, ratio * sourceGapLength
+ - trailingCopiedGapLength);
+ for (int i = 0; i < gapsToAdd; i++)
+ {
+ thisAligned.append(myGapChar);
+ }
+ sourceGapLength = 0;
+ }
+ thisAligned.append(c);
+ trailingCopiedGapLength = 0;
+ }
+ else if ((!inCodon && preserveUnmappedGaps)
+ || (inCodon && preserveMappedGaps))
+ {
+ thisAligned.append(c);
+ trailingCopiedGapLength++;
+ }
+ }
+
+ /*
+ * Expand (if necessary) the trailing gap to the size of the aligned gap.
+ */
+ int gapsToAdd = (ratio * sourceGapLength - trailingCopiedGapLength);
+ for (int i = 0; i < gapsToAdd; i++)
+ {
+ thisAligned.append(myGapChar);
+ }
+ }
+
+ /*
+ * At end of protein sequence. Copy any remaining dna sequence, optionally
+ * including (intron) gaps. We do not copy trailing gaps in protein.
+ */
+ while (thisSeqPos < thisSeq.length)
+ {
+ final char c = thisSeq[thisSeqPos++];
+ if (c != myGapChar || preserveUnmappedGaps)
+ {
+ thisAligned.append(c);
+ }
+ }
+
+ /*
+ * All done aligning, set the aligned sequence.
+ */
+ alignTo.setSequence(new String(thisAligned));
+ }
}