import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
* Answers true if the mappings include one between the given (dataset)
* sequences.
*/
- public static boolean mappingExists(Set<AlignedCodonFrame> set,
+ public static boolean mappingExists(List<AlignedCodonFrame> mappings,
SequenceI aaSeq, SequenceI cdnaSeq)
{
- if (set != null)
+ if (mappings != null)
{
- for (AlignedCodonFrame acf : set)
+ for (AlignedCodonFrame acf : mappings)
{
if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
{
/*
* Locate the aligned source sequence whose dataset sequence is mapped. We
- * just take the first match here (as we can't align cDNA like more than one
- * protein sequence).
+ * just take the first match here (as a sequence can't be aligned like more
+ * than one other sequence).
*/
SequenceI alignFrom = null;
AlignedCodonFrame mapping = null;
/**
* Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
* match residues and codons. Flags control whether existing gaps in unmapped
- * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro
- * and exon are only retained if both flags are set.
+ * (intron) and mapped (exon) regions are preserved or not. Gaps between
+ * intron and exon are only retained if both flags are set.
*
* @param alignTo
* @param alignFrom
boolean preserveUnmappedGaps)
{
// TODO generalise to work for Protein-Protein, dna-dna, dna-protein
- final char[] thisSeq = alignTo.getSequence();
- final char[] thatAligned = alignFrom.getSequence();
- StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
// aligned and dataset sequence positions, all base zero
int thisSeqPos = 0;
char myGapChar = myGap.charAt(0);
int ratio = myGap.length();
- /*
- * Traverse the aligned protein sequence.
- */
int fromOffset = alignFrom.getStart() - 1;
int toOffset = alignTo.getStart() - 1;
int sourceGapMappedLength = 0;
boolean inExon = false;
+ final char[] thisSeq = alignTo.getSequence();
+ final char[] thatAligned = alignFrom.getSequence();
+ StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+
+ /*
+ * Traverse the 'model' aligned sequence
+ */
for (char sourceChar : thatAligned)
{
if (sourceChar == sourceGap)
}
/*
- * Found a residue. Locate its mapped codon (start) position.
+ * Found a non-gap character. Locate its mapped region if any.
*/
sourceDsPos++;
// Note mapping positions are base 1, our sequence positions base 0
if (mappedPos == null)
{
/*
- * Abort realignment if unmapped protein. Or could ignore it??
+ * unmapped position; treat like a gap
*/
- System.err.println("Can't align: no codon mapping to residue "
- + sourceDsPos + "(" + sourceChar + ")");
- return;
+ sourceGapMappedLength += ratio;
+ // System.err.println("Can't align: no codon mapping to residue "
+ // + sourceDsPos + "(" + sourceChar + ")");
+ // return;
+ continue;
}
int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
}
/*
- * At end of protein sequence. Copy any remaining dna sequence, optionally
- * including (intron) gaps. We do not copy trailing gaps in protein.
+ * At end of model aligned sequence. Copy any remaining target sequence, optionally
+ * including (intron) gaps.
*/
while (thisSeqPos < thisSeq.length)
{
{
thisAligned.append(c);
}
+ sourceGapMappedLength--;
+ }
+
+ /*
+ * finally add gaps to pad for any trailing source gaps or
+ * unmapped characters
+ */
+ if (preserveUnmappedGaps)
+ {
+ while (sourceGapMappedLength > 0)
+ {
+ thisAligned.append(myGapChar);
+ sourceGapMappedLength--;
+ }
}
/*
List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
unmappedProtein.addAll(protein.getSequences());
- Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+ List<AlignedCodonFrame> mappings = protein.getCodonFrames();
/*
* Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
}
AlignmentI dna = al1.isNucleotide() ? al1 : al2;
AlignmentI protein = dna == al1 ? al2 : al1;
- Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+ List<AlignedCodonFrame> mappings = protein.getCodonFrames();
for (SequenceI dnaSeq : dna.getSequences())
{
for (SequenceI proteinSeq : protein.getSequences())
* @return
*/
protected static boolean isMappable(SequenceI dnaSeq,
- SequenceI proteinSeq, Set<AlignedCodonFrame> mappings)
+ SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
{
if (dnaSeq == null || proteinSeq == null)
{
* sequences (or null if no exons are found)
*/
public static AlignmentI makeExonAlignment(SequenceI[] dna,
- Set<AlignedCodonFrame> mappings)
+ List<AlignedCodonFrame> mappings)
{
- Set<AlignedCodonFrame> newMappings = new LinkedHashSet<AlignedCodonFrame>();
+ List<AlignedCodonFrame> newMappings = new ArrayList<AlignedCodonFrame>();
List<SequenceI> exonSequences = new ArrayList<SequenceI>();
for (SequenceI dnaSeq : dna)