/*
- * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
- * Copyright (C) 2014 The Jalview Authors
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
{
/**
- * Represents the 3 possible results of trying to map one alignment to
- * another.
- */
- public enum MappingResult
- {
- Mapped, NotMapped, AlreadyMapped
- }
-
- /**
* given an existing alignment, create a new alignment including all, or up to
* flankSize additional symbols from each sequence's dataset sequence
*
/**
* Build mapping of protein to cDNA alignment. Mappings are made between
* sequences where the cDNA translates to the protein sequence. Any new
- * mappings are added to the protein alignment. Has a 3-valued result: either
- * Mapped (at least one sequence mapping was created), AlreadyMapped (all
- * possible sequence mappings already exist), or NotMapped (no possible
- * sequence mappings exist).
+ * mappings are added to the protein alignment. Returns true if any mappings
+ * either already exist or were added, else false.
*
* @param proteinAlignment
* @param cdnaAlignment
* @return
*/
- public static MappingResult mapProteinToCdna(
+ public static boolean mapProteinToCdna(
final AlignmentI proteinAlignment,
final AlignmentI cdnaAlignment)
{
if (proteinAlignment == null || cdnaAlignment == null)
{
- return MappingResult.NotMapped;
+ return false;
}
- boolean mappingPossible = false;
- boolean mappingPerformed = false;
+ Set<SequenceI> mappedDna = new HashSet<SequenceI>();
+ Set<SequenceI> mappedProtein = new HashSet<SequenceI>();
+
+ /*
+ * First pass - map sequences where cross-references exist. This includes
+ * 1-to-many mappings to support, for example, variant cDNA.
+ */
+ boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
+ cdnaAlignment, mappedDna, mappedProtein, true);
- List<SequenceI> mapped = new ArrayList<SequenceI>();
+ /*
+ * Second pass - map sequences where no cross-references exist. This only
+ * does 1-to-1 mappings and assumes corresponding sequences are in the same
+ * order in the alignments.
+ */
+ mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
+ mappedDna, mappedProtein, false);
+ return mappingPerformed;
+ }
+ /**
+ * Make mappings between compatible sequences (where the cDNA translation
+ * matches the protein).
+ *
+ * @param proteinAlignment
+ * @param cdnaAlignment
+ * @param mappedDna
+ * a set of mapped DNA sequences (to add to)
+ * @param mappedProtein
+ * a set of mapped Protein sequences (to add to)
+ * @param xrefsOnly
+ * if true, only map sequences where xrefs exist
+ * @return true if any mappings either already exist or were added, else
+ * false
+ */
+ protected static boolean mapProteinToCdna(
+ final AlignmentI proteinAlignment,
+ final AlignmentI cdnaAlignment, Set<SequenceI> mappedDna,
+ Set<SequenceI> mappedProtein, boolean xrefsOnly)
+ {
+ boolean mappingPerformed = false;
List<SequenceI> thisSeqs = proteinAlignment.getSequences();
-
for (SequenceI aaSeq : thisSeqs)
{
+ boolean proteinMapped = false;
AlignedCodonFrame acf = new AlignedCodonFrame();
for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
{
/*
- * Heuristic rule: don't map more than one AA sequence to the same cDNA;
- * map progressively assuming that alignments have mappable sequences in
- * the same respective order
+ * Always try to map if sequences have xref to each other; this supports
+ * variant cDNA or alternative splicing for a protein sequence.
+ *
+ * If no xrefs, try to map progressively, assuming that alignments have
+ * mappable sequences in corresponding order. These are not
+ * many-to-many, as that would risk mixing species with similar cDNA
+ * sequences.
*/
- if (mapped.contains(cdnaSeq))
+ if (xrefsOnly && !CrossRef.haveCrossRef(aaSeq, cdnaSeq))
+ {
+ continue;
+ }
+
+ /*
+ * Don't map non-xrefd sequences more than once each. This heuristic
+ * allows us to pair up similar sequences in ordered alignments.
+ */
+ if (!xrefsOnly
+ && (mappedProtein.contains(aaSeq) || mappedDna
+ .contains(cdnaSeq)))
{
continue;
}
{
acf.addMap(cdnaSeq, aaSeq, map);
mappingPerformed = true;
- mapped.add(cdnaSeq);
-
- /*
- * Heuristic rule #2: don't map AA sequence to more than one cDNA
- */
- break;
+ proteinMapped = true;
+ mappedDna.add(cdnaSeq);
+ mappedProtein.add(aaSeq);
}
}
}
- proteinAlignment.addCodonFrame(acf);
- }
-
- /*
- * If at least one mapping was possible but none was done, then the
- * alignments are already as mapped as they can be.
- */
- if (mappingPossible && !mappingPerformed)
- {
- return MappingResult.AlreadyMapped;
- }
- else
- {
- return mappingPerformed ? MappingResult.Mapped
- : MappingResult.NotMapped;
+ if (proteinMapped)
+ {
+ proteinAlignment.addCodonFrame(acf);
+ }
}
+ return mappingPerformed;
}
/**
{
int aaResidue = 0;
for (int i = cdnaStart; i < cdnaSeqChars.length - 2
- && aaResidue < aaSeqChars.length; i += 3)
+ && aaResidue < aaSeqChars.length; i += 3, aaResidue++)
{
String codon = String.valueOf(cdnaSeqChars, i, 3);
final String translated = ResidueProperties.codonTranslate(
codon);
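+ /*
+ * Tentative rule (to be confirmed): allow an 'X' in the protein to match
+ * a codon that cannot be translated (codonTranslate returned null), and
+ * move on to the next codon / residue pair.
+ */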
+ final char aaRes = aaSeqChars[aaResidue];
+ if (translated == null && aaRes == 'X')
+ {
+ continue;
+ }
if (translated == null
- || !(aaSeqChars[aaResidue] == translated.charAt(0)))
+ || !(aaRes == translated.charAt(0)))
{
+ // debug
+ System.out.println(("Mismatch at " + i + "/" + aaResidue + ": "
+ + codon + "(" + translated + ") != " + aaRes));
return false;
}
- aaResidue++;
}
// fail if we didn't match all of the aa sequence
return (aaResidue == aaSeqChars.length);