/*
 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
 * Copyright (C) $$Year-Rel$$ The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
import jalview.analysis.AlignmentSorter;
import jalview.api.AlignViewportI;
import jalview.commands.CommandI;
import jalview.commands.EditCommand;
import jalview.commands.EditCommand.Action;
import jalview.commands.EditCommand.Edit;
import jalview.commands.OrderCommand;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.AlignmentOrder;
import jalview.datamodel.ColumnSelection;
import jalview.datamodel.HiddenColumns;
import jalview.datamodel.SearchResultMatchI;
import jalview.datamodel.SearchResults;
import jalview.datamodel.SearchResultsI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceGroup;
import jalview.datamodel.SequenceI;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
51 * Helper methods for manipulations involving sequence mappings.
56 public final class MappingUtils
60 * Helper method to map a CUT or PASTE command.
63 * the original command
65 * if true, the command is to be undone
67 * the mapped sequences to apply the mapped command to
69 * the mapped EditCommand to add to
72 protected static void mapCutOrPaste(Edit edit, boolean undo,
73 List<SequenceI> targetSeqs, EditCommand result,
74 List<AlignedCodonFrame> mappings)
76 Action action = edit.getAction();
79 action = action.getUndoAction();
82 System.err.println("MappingUtils.mapCutOrPaste not yet implemented");
86 * Returns a new EditCommand representing the given command as mapped to the
87 * given sequences. If there is no mapping, returns null.
96 public static EditCommand mapEditCommand(EditCommand command,
97 boolean undo, final AlignmentI mapTo, char gapChar,
98 List<AlignedCodonFrame> mappings)
101 * For now, only support mapping from protein edits to cDna
103 if (!mapTo.isNucleotide())
109 * Cache a copy of the target sequences so we can mimic successive edits on
110 * them. This lets us compute mappings for all edits in the set.
112 Map<SequenceI, SequenceI> targetCopies = new HashMap<>();
113 for (SequenceI seq : mapTo.getSequences())
115 SequenceI ds = seq.getDatasetSequence();
118 final SequenceI copy = new Sequence(seq);
119 copy.setDatasetSequence(ds);
120 targetCopies.put(ds, copy);
125 * Compute 'source' sequences as they were before applying edits:
127 Map<SequenceI, SequenceI> originalSequences = command.priorState(undo);
129 EditCommand result = new EditCommand();
130 Iterator<Edit> edits = command.getEditIterator(!undo);
131 while (edits.hasNext())
133 Edit edit = edits.next();
134 if (edit.getAction() == Action.CUT
135 || edit.getAction() == Action.PASTE)
137 mapCutOrPaste(edit, undo, mapTo.getSequences(), result, mappings);
139 else if (edit.getAction() == Action.INSERT_GAP
140 || edit.getAction() == Action.DELETE_GAP)
142 mapInsertOrDelete(edit, undo, originalSequences,
143 mapTo.getSequences(), targetCopies, gapChar, result,
147 return result.getSize() > 0 ? result : null;
151 * Helper method to map an edit command to insert or delete gaps.
154 * the original command
156 * if true, the action is to undo the command
157 * @param originalSequences
158 * the sequences the command acted on
160 * @param targetCopies
163 * the new EditCommand to add mapped commands to
166 protected static void mapInsertOrDelete(Edit edit, boolean undo,
167 Map<SequenceI, SequenceI> originalSequences,
168 final List<SequenceI> targetSeqs,
169 Map<SequenceI, SequenceI> targetCopies, char gapChar,
170 EditCommand result, List<AlignedCodonFrame> mappings)
172 Action action = edit.getAction();
175 * Invert sense of action if an Undo.
179 action = action.getUndoAction();
181 final int count = edit.getNumber();
182 final int editPos = edit.getPosition();
183 for (SequenceI seq : edit.getSequences())
186 * Get residue position at (or to right of) edit location. Note we use our
187 * 'copy' of the sequence before editing for this.
189 SequenceI ds = seq.getDatasetSequence();
194 final SequenceI actedOn = originalSequences.get(ds);
195 final int seqpos = actedOn.findPosition(editPos);
198 * Determine all mappings from this position to mapped sequences.
200 SearchResultsI sr = buildSearchResults(seq, seqpos, mappings);
204 for (SequenceI targetSeq : targetSeqs)
206 ds = targetSeq.getDatasetSequence();
211 SequenceI copyTarget = targetCopies.get(ds);
212 final int[] match = sr.getResults(copyTarget, 0,
213 copyTarget.getLength());
216 final int ratio = 3; // TODO: compute this - how?
217 final int mappedCount = count * ratio;
220 * Shift Delete start position left, as it acts on positions to its
223 int mappedEditPos = action == Action.DELETE_GAP
224 ? match[0] - mappedCount
226 Edit e = result.new Edit(action, new SequenceI[] { targetSeq },
227 mappedEditPos, mappedCount, gapChar);
231 * and 'apply' the edit to our copy of its target sequence
233 if (action == Action.INSERT_GAP)
235 copyTarget.setSequence(new String(
236 StringUtils.insertCharAt(copyTarget.getSequence(),
237 mappedEditPos, mappedCount, gapChar)));
239 else if (action == Action.DELETE_GAP)
241 copyTarget.setSequence(new String(
242 StringUtils.deleteChars(copyTarget.getSequence(),
243 mappedEditPos, mappedEditPos + mappedCount)));
249 * and 'apply' the edit to our copy of its source sequence
251 if (action == Action.INSERT_GAP)
253 actedOn.setSequence(new String(StringUtils.insertCharAt(
254 actedOn.getSequence(), editPos, count, gapChar)));
256 else if (action == Action.DELETE_GAP)
258 actedOn.setSequence(new String(StringUtils.deleteChars(
259 actedOn.getSequence(), editPos, editPos + count)));
265 * Returns a SearchResults object describing the mapped region corresponding
266 * to the specified sequence position.
273 public static SearchResultsI buildSearchResults(SequenceI seq, int index,
274 List<AlignedCodonFrame> seqmappings)
276 SearchResultsI results = new SearchResults();
277 addSearchResults(results, seq, index, seqmappings);
282 * Adds entries to a SearchResults object describing the mapped region
283 * corresponding to the specified sequence position.
290 public static void addSearchResults(SearchResultsI results, SequenceI seq,
291 int index, List<AlignedCodonFrame> seqmappings)
293 if (index >= seq.getStart() && index <= seq.getEnd())
295 for (AlignedCodonFrame acf : seqmappings)
297 acf.markMappedRegion(seq, index, results);
303 * Returns a (possibly empty) SequenceGroup containing any sequences in the
304 * mapped viewport corresponding to the given group in the source viewport.
311 public static SequenceGroup mapSequenceGroup(final SequenceGroup sg,
312 final AlignViewportI mapFrom, final AlignViewportI mapTo)
315 * Note the SequenceGroup holds aligned sequences, the mappings hold dataset
318 boolean targetIsNucleotide = mapTo.isNucleotide();
319 AlignViewportI protein = targetIsNucleotide ? mapFrom : mapTo;
320 List<AlignedCodonFrame> codonFrames = protein.getAlignment()
323 * Copy group name, colours etc, but not sequences or sequence colour scheme
325 SequenceGroup mappedGroup = new SequenceGroup(sg);
326 mappedGroup.setColourScheme(mapTo.getGlobalColourScheme());
329 int minStartCol = -1;
331 final int selectionStartRes = sg.getStartRes();
332 final int selectionEndRes = sg.getEndRes();
333 for (SequenceI selected : sg.getSequences())
336 * Find the widest range of non-gapped positions in the selection range
338 int firstUngappedPos = selectionStartRes;
339 while (firstUngappedPos <= selectionEndRes
340 && Comparison.isGap(selected.getCharAt(firstUngappedPos)))
346 * If this sequence is only gaps in the selected range, skip it
348 if (firstUngappedPos > selectionEndRes)
353 int lastUngappedPos = selectionEndRes;
354 while (lastUngappedPos >= selectionStartRes
355 && Comparison.isGap(selected.getCharAt(lastUngappedPos)))
361 * Find the selected start/end residue positions in sequence
363 int startResiduePos = selected.findPosition(firstUngappedPos);
364 int endResiduePos = selected.findPosition(lastUngappedPos);
366 for (AlignedCodonFrame acf : codonFrames)
368 for (SequenceI seq : mapTo.getAlignment().getSequences())
370 SequenceI peptide = targetIsNucleotide ? selected : seq;
371 SequenceI cds = targetIsNucleotide ? seq : selected;
372 SequenceToSequenceMapping s2s = acf.getCoveringMapping(cds,
378 int mappedStartResidue = 0;
379 int mappedEndResidue = 0;
380 List<AlignedCodonFrame> mapping = Arrays.asList(acf);
381 SearchResultsI sr = buildSearchResults(selected, startResiduePos,
383 for (SearchResultMatchI m : sr.getResults())
385 mappedStartResidue = m.getStart();
386 mappedEndResidue = m.getEnd();
388 sr = buildSearchResults(selected, endResiduePos, mapping);
389 for (SearchResultMatchI m : sr.getResults())
391 mappedStartResidue = Math.min(mappedStartResidue, m.getStart());
392 mappedEndResidue = Math.max(mappedEndResidue, m.getEnd());
396 * Find the mapped aligned columns, save the range. Note findIndex
397 * returns a base 1 position, SequenceGroup uses base 0
399 int mappedStartCol = seq.findIndex(mappedStartResidue) - 1;
400 minStartCol = minStartCol == -1 ? mappedStartCol
401 : Math.min(minStartCol, mappedStartCol);
402 int mappedEndCol = seq.findIndex(mappedEndResidue) - 1;
403 maxEndCol = maxEndCol == -1 ? mappedEndCol
404 : Math.max(maxEndCol, mappedEndCol);
405 mappedGroup.addSequence(seq, false);
410 mappedGroup.setStartRes(minStartCol < 0 ? 0 : minStartCol);
411 mappedGroup.setEndRes(maxEndCol < 0 ? 0 : maxEndCol);
416 * Returns an OrderCommand equivalent to the given one, but acting on mapped
417 * sequences as described by the mappings, or null if no mapping can be made.
420 * the original order command
422 * if true, the action is to undo the sort
424 * the alignment we are mapping to
426 * the mappings available
429 public static CommandI mapOrderCommand(OrderCommand command, boolean undo,
430 AlignmentI mapTo, List<AlignedCodonFrame> mappings)
432 SequenceI[] sortOrder = command.getSequenceOrder(undo);
433 List<SequenceI> mappedOrder = new ArrayList<>();
437 * Assumption: we are only interested in a cDNA/protein mapping; refactor in
438 * future if we want to support sorting (c)dna as (c)dna or protein as
441 boolean mappingToNucleotide = mapTo.isNucleotide();
442 for (SequenceI seq : sortOrder)
444 for (AlignedCodonFrame acf : mappings)
446 SequenceI mappedSeq = mappingToNucleotide ? acf.getDnaForAaSeq(seq)
447 : acf.getAaForDnaSeq(seq);
448 if (mappedSeq != null)
450 for (SequenceI seq2 : mapTo.getSequences())
452 if (seq2.getDatasetSequence() == mappedSeq)
454 mappedOrder.add(seq2);
464 * Return null if no mappings made.
472 * Add any unmapped sequences on the end of the sort in their original
475 if (j < mapTo.getHeight())
477 for (SequenceI seq : mapTo.getSequences())
479 if (!mappedOrder.contains(seq))
481 mappedOrder.add(seq);
487 * Have to sort the sequences before constructing the OrderCommand - which
488 * then resorts them?!?
490 final SequenceI[] mappedOrderArray = mappedOrder
491 .toArray(new SequenceI[mappedOrder.size()]);
492 SequenceI[] oldOrder = mapTo.getSequencesArray();
493 AlignmentSorter.sortBy(mapTo, new AlignmentOrder(mappedOrderArray));
494 final OrderCommand result = new OrderCommand(command.getDescription(),
500 * Returns a ColumnSelection in the 'mapTo' view which corresponds to the
501 * given selection in the 'mapFrom' view. We assume one is nucleotide, the
502 * other is protein (and holds the mappings from codons to protein residues).
509 public static void mapColumnSelection(ColumnSelection colsel,
510 HiddenColumns hiddencols, AlignViewportI mapFrom,
511 AlignViewportI mapTo, ColumnSelection newColSel,
512 HiddenColumns newHidden)
514 boolean targetIsNucleotide = mapTo.isNucleotide();
515 AlignViewportI protein = targetIsNucleotide ? mapFrom : mapTo;
516 List<AlignedCodonFrame> codonFrames = protein.getAlignment()
524 char fromGapChar = mapFrom.getAlignment().getGapCharacter();
527 * For each mapped column, find the range of columns that residues in that
530 List<SequenceI> fromSequences = mapFrom.getAlignment().getSequences();
531 List<SequenceI> toSequences = mapTo.getAlignment().getSequences();
533 for (Integer sel : colsel.getSelected())
535 mapColumn(sel.intValue(), codonFrames, newColSel, fromSequences,
536 toSequences, fromGapChar);
539 Iterator<int[]> regions = hiddencols.iterator();
540 while (regions.hasNext())
542 mapHiddenColumns(regions.next(), codonFrames, newHidden,
543 fromSequences, toSequences, fromGapChar);
549 * Helper method that maps a [start, end] hidden column range to its mapped
554 * @param mappedColumns
555 * @param fromSequences
559 protected static void mapHiddenColumns(int[] hidden,
560 List<AlignedCodonFrame> mappings, HiddenColumns mappedColumns,
561 List<SequenceI> fromSequences, List<SequenceI> toSequences,
564 for (int col = hidden[0]; col <= hidden[1]; col++)
566 int[] mappedTo = findMappedColumns(col, mappings, fromSequences,
567 toSequences, fromGapChar);
570 * Add the range of hidden columns to the mapped selection (converting
573 if (mappedTo != null)
575 mappedColumns.hideColumns(mappedTo[0] - 1, mappedTo[1] - 1);
581 * Helper method to map one column selection
584 * the column number (base 0)
586 * the sequence mappings
587 * @param mappedColumns
588 * the mapped column selections to add to
589 * @param fromSequences
593 protected static void mapColumn(int col, List<AlignedCodonFrame> mappings,
594 ColumnSelection mappedColumns, List<SequenceI> fromSequences,
595 List<SequenceI> toSequences, char fromGapChar)
597 int[] mappedTo = findMappedColumns(col, mappings, fromSequences,
598 toSequences, fromGapChar);
601 * Add the range of mapped columns to the mapped selection (converting
602 * base 1 to base 0). Note that this may include intron-only regions which
603 * lie between the start and end ranges of the selection.
605 if (mappedTo != null)
607 for (int i = mappedTo[0]; i <= mappedTo[1]; i++)
609 mappedColumns.addElement(i - 1);
615 * Helper method to find the range of columns mapped to from one column.
616 * Returns the maximal range of columns mapped to from all sequences in the
617 * source column, or null if no mappings were found.
621 * @param fromSequences
626 protected static int[] findMappedColumns(int col,
627 List<AlignedCodonFrame> mappings, List<SequenceI> fromSequences,
628 List<SequenceI> toSequences, char fromGapChar)
630 int[] mappedTo = new int[] { Integer.MAX_VALUE, Integer.MIN_VALUE };
631 boolean found = false;
634 * For each sequence in the 'from' alignment
636 for (SequenceI fromSeq : fromSequences)
639 * Ignore gaps (unmapped anyway)
641 if (fromSeq.getCharAt(col) == fromGapChar)
647 * Get the residue position and find the mapped position.
649 int residuePos = fromSeq.findPosition(col);
650 SearchResultsI sr = buildSearchResults(fromSeq, residuePos, mappings);
651 for (SearchResultMatchI m : sr.getResults())
653 int mappedStartResidue = m.getStart();
654 int mappedEndResidue = m.getEnd();
655 SequenceI mappedSeq = m.getSequence();
658 * Locate the aligned sequence whose dataset is mappedSeq. TODO a
659 * datamodel that can do this efficiently.
661 for (SequenceI toSeq : toSequences)
663 if (toSeq.getDatasetSequence() == mappedSeq
664 && mappedStartResidue >= toSeq.getStart()
665 && mappedEndResidue <= toSeq.getEnd())
667 int mappedStartCol = toSeq.findIndex(mappedStartResidue);
668 int mappedEndCol = toSeq.findIndex(mappedEndResidue);
669 mappedTo[0] = Math.min(mappedTo[0], mappedStartCol);
670 mappedTo[1] = Math.max(mappedTo[1], mappedEndCol);
673 // note: remove break if we ever want to map one to many sequences
678 return found ? mappedTo : null;
682 * Returns the mapped codon or codons for a given aligned sequence column
686 * an aligned peptide sequence
688 * an aligned column position (base 0)
690 * a set of codon mappings
691 * @return the bases of the mapped codon(s) in the cDNA dataset sequence(s),
692 * or an empty list if none found
694 public static List<char[]> findCodonsFor(SequenceI seq, int col,
695 List<AlignedCodonFrame> mappings)
697 List<char[]> result = new ArrayList<>();
698 int dsPos = seq.findPosition(col);
699 for (AlignedCodonFrame mapping : mappings)
701 if (mapping.involvesSequence(seq))
703 List<char[]> codons = mapping
704 .getMappedCodons(seq.getDatasetSequence(), dsPos);
707 result.addAll(codons);
715 * Converts a series of [start, end] range pairs into an array of individual
716 * positions. This also caters for 'reverse strand' (start > end) cases.
721 public static int[] flattenRanges(int[] ranges)
724 * Count how many positions altogether
727 for (int i = 0; i < ranges.length - 1; i += 2)
729 count += Math.abs(ranges[i + 1] - ranges[i]) + 1;
732 int[] result = new int[count];
734 for (int i = 0; i < ranges.length - 1; i += 2)
736 int from = ranges[i];
737 final int to = ranges[i + 1];
738 int step = from <= to ? 1 : -1;
743 } while (from != to + step);
749 * Returns a list of any mappings that are from or to the given (aligned or
756 public static List<AlignedCodonFrame> findMappingsForSequence(
757 SequenceI sequence, List<AlignedCodonFrame> mappings)
759 return findMappingsForSequenceAndOthers(sequence, mappings, null);
763 * Returns a list of any mappings that are from or to the given (aligned or
764 * dataset) sequence, optionally limited to mappings involving one of a given
772 public static List<AlignedCodonFrame> findMappingsForSequenceAndOthers(
773 SequenceI sequence, List<AlignedCodonFrame> mappings,
774 List<SequenceI> filterList)
776 List<AlignedCodonFrame> result = new ArrayList<>();
777 if (sequence == null || mappings == null)
781 for (AlignedCodonFrame mapping : mappings)
783 if (mapping.involvesSequence(sequence))
785 if (filterList != null)
787 for (SequenceI otherseq : filterList)
789 SequenceI otherDataset = otherseq.getDatasetSequence();
790 if (otherseq == sequence
791 || otherseq == sequence.getDatasetSequence()
792 || (otherDataset != null && (otherDataset == sequence
793 || otherDataset == sequence
794 .getDatasetSequence())))
796 // skip sequences in subset which directly relate to sequence
799 if (mapping.involvesSequence(otherseq))
801 // selected a mapping contained in subselect alignment
817 * Returns the total length of the supplied ranges, which may be as single
818 * [start, end] or multiple [start, end, start, end ...]
823 public static int getLength(List<int[]> ranges)
830 for (int[] range : ranges)
832 if (range.length % 2 != 0)
835 "Error unbalance start/end ranges: " + ranges.toString());
838 for (int i = 0; i < range.length - 1; i += 2)
840 length += Math.abs(range[i + 1] - range[i]) + 1;
847 * Answers true if any range includes the given value
853 public static boolean contains(List<int[]> ranges, int value)
859 for (int[] range : ranges)
861 if (range[1] >= range[0] && value >= range[0] && value <= range[1])
864 * value within ascending range
868 if (range[1] < range[0] && value <= range[0] && value >= range[1])
871 * value within descending range
880 * Removes a specified number of positions from the start of a ranges list.
881 * For example, could be used to adjust cds ranges to allow for an incomplete
882 * start codon. Subranges are removed completely, or their start positions
883 * adjusted, until the required number of positions has been removed from the
884 * range. Reverse strand ranges are supported. The input array is not
889 * an array of [start, end, start, end...] positions
890 * @return a new array with the first removeCount positions removed
892 public static int[] removeStartPositions(int removeCount,
895 if (removeCount <= 0)
900 int[] copy = Arrays.copyOf(ranges, ranges.length);
903 for (int x = 0; x < copy.length && sxpos == -1; x += 2)
905 cdspos += Math.abs(copy[x + 1] - copy[x]) + 1;
906 if (removeCount < cdspos)
909 * we have removed enough, time to finish
914 * increment start of first exon, or decrement if reverse strand
916 if (copy[x] <= copy[x + 1])
918 copy[x] = copy[x + 1] - cdspos + removeCount + 1;
922 copy[x] = copy[x + 1] + cdspos - removeCount - 1;
931 * we dropped at least one entire sub-range - compact the array
933 int[] nxon = new int[copy.length - sxpos];
934 System.arraycopy(copy, sxpos, nxon, 0, copy.length - sxpos);
941 * Answers true if range's start-end positions include those of queryRange,
942 * where either range might be in reverse direction, else false
947 * a candidate subrange of range (start2-end2)
950 public static boolean rangeContains(int[] range, int[] queryRange)
952 if (range == null || queryRange == null || range.length != 2
953 || queryRange.length != 2)
961 int min = Math.min(range[0], range[1]);
962 int max = Math.max(range[0], range[1]);
964 return (min <= queryRange[0] && max >= queryRange[0]
965 && min <= queryRange[1] && max >= queryRange[1]);
969 * Removes the specified number of positions from the given ranges. Provided
970 * to allow a stop codon to be stripped from a CDS sequence so that it matches
971 * the peptide translation length.
975 * a list of (single) [start, end] ranges
978 public static void removeEndPositions(int positions, List<int[]> ranges)
980 int toRemove = positions;
981 Iterator<int[]> it = new ReverseListIterator<>(ranges);
984 int[] endRange = it.next();
985 if (endRange.length != 2)
988 * not coded for [start1, end1, start2, end2, ...]
991 "MappingUtils.removeEndPositions doesn't handle multiple ranges");
995 int length = endRange[1] - endRange[0] + 1;
999 * not coded for a reverse strand range (end < start)
1002 "MappingUtils.removeEndPositions doesn't handle reverse strand");
1005 if (length > toRemove)
1007 endRange[1] -= toRemove;
1019 * Converts a list of [start, end] ranges to a single array of [start, end,
1025 public static int[] listToArray(List<int[]> ranges)
1027 int[] result = new int[ranges.size() * 2];
1029 for (int[] range : ranges)
1031 result[i++] = range[0];
1032 result[i++] = range[1];