From e132642765c503cb0c93dc47f304007d0527c2cc Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 3 Aug 2020 16:28:15 +0100 Subject: [PATCH] JAL-3700 check for 'covering' mapping when matching CDS to peptides --- src/jalview/datamodel/AlignedCodonFrame.java | 69 ++++++++++++++++- src/jalview/util/MappingUtils.java | 104 ++++++++++++-------------- 2 files changed, 116 insertions(+), 57 deletions(-) diff --git a/src/jalview/datamodel/AlignedCodonFrame.java b/src/jalview/datamodel/AlignedCodonFrame.java index fffa137..7fa8b29 100644 --- a/src/jalview/datamodel/AlignedCodonFrame.java +++ b/src/jalview/datamodel/AlignedCodonFrame.java @@ -107,6 +107,55 @@ public class AlignedCodonFrame { return mapping; } + + /** + * Returns true if the mapping covers the full length of the given sequence. + * This allows us to distinguish the CDS that codes for a protein from + * another overlapping CDS in the parent dna sequence. + * + * @param seq + * @return + */ + public boolean covers(SequenceI seq) + { + List mappedRanges = null; + MapList mapList = mapping.getMap(); + if (fromSeq == seq || fromSeq == seq.getDatasetSequence()) + { + mappedRanges = mapList.getFromRanges(); + } + else if (mapping.to == seq || mapping.to == seq.getDatasetSequence()) + { + mappedRanges = mapList.getToRanges(); + } + else + { + return false; + } + + /* + * check that each mapped range lies within the sequence range + * (necessary for circular CDS - example EMBL:J03321:AAA91567) + * and mapped length covers (at least) sequence length + */ + int length = 0; + for (int[] range : mappedRanges) + { + int from = Math.min(range[0], range[1]); + int to = Math.max(range[0], range[1]); + if (from < seq.getStart() || to > seq.getEnd()) + { + return false; + } + length += (to - from + 1); + } + // add 1 to mapped length to allow for a mapped stop codon + if (length + 1 < (seq.getEnd() - seq.getStart() + 1)) + { + return false; + } + return true; + } } private List mappings; @@ -261,9 +310,12 @@ public class 
AlignedCodonFrame } /** + * Return the corresponding aligned or dataset dna sequence for given amino + * acid sequence, or null if not found. Returns the sequence from + * the first mapping found that involves the protein sequence. * - * @param sequenceRef - * @return null or corresponding aaSeq entry for dnaSeq entry + * @param aaSeqRef + * @return */ public SequenceI getDnaForAaSeq(SequenceI aaSeqRef) { @@ -783,4 +835,17 @@ public class AlignedCodonFrame { return mappings; } + + public SequenceToSequenceMapping getCoveringMapping(SequenceI cds, + SequenceI peptide) + { + for (SequenceToSequenceMapping mapping : mappings) + { + if (mapping.covers(peptide) && mapping.covers(cds)) + { + return mapping; + } + } + return null; + } } diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java index 915293e..cf90bf9 100644 --- a/src/jalview/util/MappingUtils.java +++ b/src/jalview/util/MappingUtils.java @@ -28,6 +28,7 @@ import jalview.commands.EditCommand.Action; import jalview.commands.EditCommand.Edit; import jalview.commands.OrderCommand; import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping; import jalview.datamodel.AlignmentI; import jalview.datamodel.AlignmentOrder; import jalview.datamodel.ColumnSelection; @@ -364,52 +365,45 @@ public final class MappingUtils for (AlignedCodonFrame acf : codonFrames) { - SequenceI mappedSequence = targetIsNucleotide - ? acf.getDnaForAaSeq(selected) - : acf.getAaForDnaSeq(selected); - if (mappedSequence != null) + for (SequenceI seq : mapTo.getAlignment().getSequences()) { - for (SequenceI seq : mapTo.getAlignment().getSequences()) + SequenceI peptide = targetIsNucleotide ? selected : seq; + SequenceI cds = targetIsNucleotide ? 
seq : selected; + SequenceToSequenceMapping s2s = acf.getCoveringMapping(cds, + peptide); + if (s2s == null) { - int mappedStartResidue = 0; - int mappedEndResidue = 0; - if (seq.getDatasetSequence() == mappedSequence) - { - /* - * Found a sequence mapping. Locate the start/end mapped residues. - */ - List mapping = Arrays - .asList(new AlignedCodonFrame[] - { acf }); - SearchResultsI sr = buildSearchResults(selected, - startResiduePos, mapping); - for (SearchResultMatchI m : sr.getResults()) - { - mappedStartResidue = m.getStart(); - mappedEndResidue = m.getEnd(); - } - sr = buildSearchResults(selected, endResiduePos, mapping); - for (SearchResultMatchI m : sr.getResults()) - { - mappedStartResidue = Math.min(mappedStartResidue, - m.getStart()); - mappedEndResidue = Math.max(mappedEndResidue, m.getEnd()); - } - - /* - * Find the mapped aligned columns, save the range. Note findIndex - * returns a base 1 position, SequenceGroup uses base 0 - */ - int mappedStartCol = seq.findIndex(mappedStartResidue) - 1; - minStartCol = minStartCol == -1 ? mappedStartCol - : Math.min(minStartCol, mappedStartCol); - int mappedEndCol = seq.findIndex(mappedEndResidue) - 1; - maxEndCol = maxEndCol == -1 ? mappedEndCol - : Math.max(maxEndCol, mappedEndCol); - mappedGroup.addSequence(seq, false); - break; - } + continue; + } + int mappedStartResidue = 0; + int mappedEndResidue = 0; + List mapping = Arrays.asList(acf); + SearchResultsI sr = buildSearchResults(selected, startResiduePos, + mapping); + for (SearchResultMatchI m : sr.getResults()) + { + mappedStartResidue = m.getStart(); + mappedEndResidue = m.getEnd(); } + sr = buildSearchResults(selected, endResiduePos, mapping); + for (SearchResultMatchI m : sr.getResults()) + { + mappedStartResidue = Math.min(mappedStartResidue, m.getStart()); + mappedEndResidue = Math.max(mappedEndResidue, m.getEnd()); + } + + /* + * Find the mapped aligned columns, save the range. 
Note findIndex + * returns a base 1 position, SequenceGroup uses base 0 + */ + int mappedStartCol = seq.findIndex(mappedStartResidue) - 1; + minStartCol = minStartCol == -1 ? mappedStartCol + : Math.min(minStartCol, mappedStartCol); + int mappedEndCol = seq.findIndex(mappedEndResidue) - 1; + maxEndCol = maxEndCol == -1 ? mappedEndCol + : Math.max(maxEndCol, mappedEndCol); + mappedGroup.addSequence(seq, false); + break; } } } @@ -524,7 +518,7 @@ public final class MappingUtils if (colsel == null) { - return; // mappedColumns; + return; } char fromGapChar = mapFrom.getAlignment().getGapCharacter(); @@ -546,10 +540,9 @@ public final class MappingUtils while (regions.hasNext()) { mapHiddenColumns(regions.next(), codonFrames, newHidden, - fromSequences, - toSequences, fromGapChar); + fromSequences, toSequences, fromGapChar); } - return; // mappedColumns; + return; } /** @@ -667,7 +660,9 @@ public final class MappingUtils */ for (SequenceI toSeq : toSequences) { - if (toSeq.getDatasetSequence() == mappedSeq) + if (toSeq.getDatasetSequence() == mappedSeq + && mappedStartResidue >= toSeq.getStart() + && mappedEndResidue <= toSeq.getEnd()) { int mappedStartCol = toSeq.findIndex(mappedStartResidue); int mappedEndCol = toSeq.findIndex(mappedEndResidue); @@ -965,7 +960,7 @@ public final class MappingUtils int min = Math.min(range[0], range[1]); int max = Math.max(range[0], range[1]); - + return (min <= queryRange[0] && max >= queryRange[0] && min <= queryRange[1] && max >= queryRange[1]); } @@ -980,8 +975,7 @@ public final class MappingUtils * a list of (single) [start, end] ranges * @return */ - public static void removeEndPositions(int positions, - List ranges) + public static void removeEndPositions(int positions, List ranges) { int toRemove = positions; Iterator it = new ReverseListIterator<>(ranges); @@ -993,8 +987,8 @@ public final class MappingUtils /* * not coded for [start1, end1, start2, end2, ...] 
*/ - System.err - .println("MappingUtils.removeEndPositions doesn't handle multiple ranges"); + System.err.println( + "MappingUtils.removeEndPositions doesn't handle multiple ranges"); return; } @@ -1004,8 +998,8 @@ public final class MappingUtils /* * not coded for a reverse strand range (end < start) */ - System.err - .println("MappingUtils.removeEndPositions doesn't handle reverse strand"); + System.err.println( + "MappingUtils.removeEndPositions doesn't handle reverse strand"); return; } if (length > toRemove) -- 1.7.10.2