X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Futil%2FMappingUtils.java;h=ae4e55d3a31eea6ea64d8f4b742457627f03db1f;hb=c775190fba1fe7430b060d48d5d8cc13902a8f47;hp=4cfb49ea09f72eecf4e59e9c87eec6d9953b4ab1;hpb=be32c14cd8e48fe0a207cd7030cb9cd46f894678;p=jalview.git diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java index 4cfb49e..ae4e55d 100644 --- a/src/jalview/util/MappingUtils.java +++ b/src/jalview/util/MappingUtils.java @@ -1,3 +1,23 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.util; import jalview.analysis.AlignmentSorter; @@ -18,11 +38,11 @@ import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; /** * Helper methods for manipulations involving sequence mappings. @@ -48,7 +68,7 @@ public final class MappingUtils */ protected static void mapCutOrPaste(Edit edit, boolean undo, List targetSeqs, EditCommand result, - Set mappings) + List mappings) { Action action = edit.getAction(); if (undo) @@ -72,7 +92,7 @@ public final class MappingUtils */ public static EditCommand mapEditCommand(EditCommand command, boolean undo, final AlignmentI mapTo, char gapChar, - Set mappings) + List mappings) { /* * For now, only support mapping from protein edits to cDna @@ -144,7 +164,7 @@ public final class MappingUtils Map originalSequences, final List targetSeqs, Map targetCopies, char gapChar, - EditCommand result, Set mappings) + EditCommand result, List mappings) { Action action = edit.getAction(); @@ -199,8 +219,8 @@ public final class MappingUtils */ int mappedEditPos = action == Action.DELETE_GAP ? match[0] - mappedCount : match[0]; - Edit e = result.new Edit(action, new SequenceI[] - { targetSeq }, mappedEditPos, mappedCount, gapChar); + Edit e = result.new Edit(action, new SequenceI[] { targetSeq }, + mappedEditPos, mappedCount, gapChar); result.addEdit(e); /* @@ -247,10 +267,25 @@ public final class MappingUtils * @return */ public static SearchResults buildSearchResults(SequenceI seq, int index, - Set seqmappings) + List seqmappings) + { + SearchResults results = new SearchResults(); + addSearchResults(results, seq, index, seqmappings); + return results; + } + + /** + * Adds entries to a SearchResults object describing the mapped region + * corresponding to the specified sequence position. + * + * @param results + * @param seq + * @param index + * @param seqmappings + */ + public static void addSearchResults(SearchResults results, SequenceI seq, + int index, List seqmappings) { - SearchResults results; - results = new SearchResults(); if (index >= seq.getStart() && index <= seq.getEnd()) { for (AlignedCodonFrame acf : seqmappings) @@ -258,7 +293,6 @@ public final class MappingUtils acf.markMappedRegion(seq, index, results); } } - return results; } /** @@ -270,8 +304,8 @@ public final class MappingUtils * @param mapTo * @return */ - public static SequenceGroup mapSequenceGroup(SequenceGroup sg, - AlignViewportI mapFrom, AlignViewportI mapTo) + public static SequenceGroup mapSequenceGroup(final SequenceGroup sg, + final AlignViewportI mapFrom, final AlignViewportI mapTo) { /* * Note the SequenceGroup holds aligned sequences, the mappings hold dataset @@ -279,20 +313,52 @@ public final class MappingUtils */ boolean targetIsNucleotide = mapTo.isNucleotide(); AlignViewportI protein = targetIsNucleotide ? mapFrom : mapTo; - Set codonFrames = protein.getAlignment() + List codonFrames = protein.getAlignment() .getCodonFrames(); - /* - * Copy group name, name colours, but not sequences or sequence colour - * scheme + * Copy group name, colours etc, but not sequences or sequence colour scheme */ SequenceGroup mappedGroup = new SequenceGroup(sg); mappedGroup.cs = mapTo.getGlobalColourScheme(); mappedGroup.clear(); - // TODO set width of mapped group + int minStartCol = -1; + int maxEndCol = -1; + final int selectionStartRes = sg.getStartRes(); + final int selectionEndRes = sg.getEndRes(); for (SequenceI selected : sg.getSequences()) { + /* + * Find the widest range of non-gapped positions in the selection range + */ + int firstUngappedPos = selectionStartRes; + while (firstUngappedPos <= selectionEndRes + && Comparison.isGap(selected.getCharAt(firstUngappedPos))) + { + firstUngappedPos++; + } + + /* + * If this sequence is only gaps in the selected range, skip it + */ + if (firstUngappedPos > selectionEndRes) + { + continue; + } + + int lastUngappedPos = selectionEndRes; + while (lastUngappedPos >= selectionStartRes + && Comparison.isGap(selected.getCharAt(lastUngappedPos))) + { + lastUngappedPos--; + } + + /* + * Find the selected start/end residue positions in sequence + */ + int startResiduePos = selected.findPosition(firstUngappedPos); + int endResiduePos = selected.findPosition(lastUngappedPos); + for (AlignedCodonFrame acf : codonFrames) { SequenceI mappedSequence = targetIsNucleotide ? acf @@ -301,8 +367,39 @@ public final class MappingUtils { for (SequenceI seq : mapTo.getAlignment().getSequences()) { + int mappedStartResidue = 0; + int mappedEndResidue = 0; if (seq.getDatasetSequence() == mappedSequence) { + /* + * Found a sequence mapping. Locate the start/end mapped residues. + */ + List mapping = Arrays.asList(new AlignedCodonFrame[] { acf }); + SearchResults sr = buildSearchResults(selected, + startResiduePos, mapping); + for (Match m : sr.getResults()) + { + mappedStartResidue = m.getStart(); + mappedEndResidue = m.getEnd(); + } + sr = buildSearchResults(selected, endResiduePos, mapping); + for (Match m : sr.getResults()) + { + mappedStartResidue = Math.min(mappedStartResidue, + m.getStart()); + mappedEndResidue = Math.max(mappedEndResidue, m.getEnd()); + } + + /* + * Find the mapped aligned columns, save the range. Note findIndex + * returns a base 1 position, SequenceGroup uses base 0 + */ + int mappedStartCol = seq.findIndex(mappedStartResidue) - 1; + minStartCol = minStartCol == -1 ? mappedStartCol : Math.min( + minStartCol, mappedStartCol); + int mappedEndCol = seq.findIndex(mappedEndResidue) - 1; + maxEndCol = maxEndCol == -1 ? mappedEndCol : Math.max( + maxEndCol, mappedEndCol); mappedGroup.addSequence(seq, false); break; } @@ -310,6 +407,8 @@ public final class MappingUtils } } } + mappedGroup.setStartRes(minStartCol < 0 ? 0 : minStartCol); + mappedGroup.setEndRes(maxEndCol < 0 ? 0 : maxEndCol); return mappedGroup; } @@ -328,23 +427,24 @@ public final class MappingUtils * @return */ public static CommandI mapOrderCommand(OrderCommand command, - boolean undo, AlignmentI mapTo, Set mappings) + boolean undo, AlignmentI mapTo, List mappings) { SequenceI[] sortOrder = command.getSequenceOrder(undo); List mappedOrder = new ArrayList(); int j = 0; + + /* + * Assumption: we are only interested in a cDNA/protein mapping; refactor in + * future if we want to support sorting (c)dna as (c)dna or protein as + * protein + */ + boolean mappingToNucleotide = mapTo.isNucleotide(); for (SequenceI seq : sortOrder) { for (AlignedCodonFrame acf : mappings) { - /* - * Try protein-to-Dna, failing that try dna-to-protein - */ - SequenceI mappedSeq = acf.getDnaForAaSeq(seq); - if (mappedSeq == null) - { - mappedSeq = acf.getAaForDnaSeq(seq); - } + SequenceI mappedSeq = mappingToNucleotide ? acf.getDnaForAaSeq(seq) + : acf.getAaForDnaSeq(seq); if (mappedSeq != null) { for (SequenceI seq2 : mapTo.getSequences()) @@ -411,84 +511,174 @@ public final class MappingUtils { boolean targetIsNucleotide = mapTo.isNucleotide(); AlignViewportI protein = targetIsNucleotide ? mapFrom : mapTo; - Set codonFrames = protein.getAlignment() + List codonFrames = protein.getAlignment() .getCodonFrames(); ColumnSelection mappedColumns = new ColumnSelection(); - char fromGapChar = mapFrom.getAlignment().getGapCharacter(); - // FIXME allow for hidden columns + if (colsel == null) + { + return mappedColumns; + } + + char fromGapChar = mapFrom.getAlignment().getGapCharacter(); /* * For each mapped column, find the range of columns that residues in that * column map to. */ - for (Object obj : colsel.getSelected()) + List fromSequences = mapFrom.getAlignment().getSequences(); + List toSequences = mapTo.getAlignment().getSequences(); + + for (Integer sel : colsel.getSelected()) { - int col = ((Integer) obj).intValue(); - int mappedToMin = Integer.MAX_VALUE; - int mappedToMax = Integer.MIN_VALUE; + mapColumn(sel.intValue(), codonFrames, mappedColumns, fromSequences, + toSequences, fromGapChar); + } + + for (int[] hidden : colsel.getHiddenColumns()) + { + mapHiddenColumns(hidden, codonFrames, mappedColumns, fromSequences, + toSequences, fromGapChar); + } + return mappedColumns; + } + + /** + * Helper method that maps a [start, end] hidden column range to its mapped + * equivalent + * + * @param hidden + * @param mappings + * @param mappedColumns + * @param fromSequences + * @param toSequences + * @param fromGapChar + */ + protected static void mapHiddenColumns(int[] hidden, + List mappings, + ColumnSelection mappedColumns, List fromSequences, + List toSequences, char fromGapChar) + { + for (int col = hidden[0]; col <= hidden[1]; col++) + { + int[] mappedTo = findMappedColumns(col, mappings, fromSequences, + toSequences, fromGapChar); /* - * For each sequence in the 'from' alignment + * Add the range of hidden columns to the mapped selection (converting + * base 1 to base 0). */ - for (SequenceI fromSeq : mapFrom.getAlignment().getSequences()) + if (mappedTo != null) { - /* - * Ignore gaps (unmapped anyway) - */ - if (fromSeq.getCharAt(col) == fromGapChar) - { - continue; - } + mappedColumns.hideColumns(mappedTo[0] - 1, mappedTo[1] - 1); + } + } + } + + /** + * Helper method to map one column selection + * + * @param col + * the column number (base 0) + * @param mappings + * the sequence mappings + * @param mappedColumns + * the mapped column selections to add to + * @param fromSequences + * @param toSequences + * @param fromGapChar + */ + protected static void mapColumn(int col, + List mappings, + ColumnSelection mappedColumns, List fromSequences, + List toSequences, char fromGapChar) + { + int[] mappedTo = findMappedColumns(col, mappings, fromSequences, + toSequences, fromGapChar); + + /* + * Add the range of mapped columns to the mapped selection (converting + * base 1 to base 0). Note that this may include intron-only regions which + * lie between the start and end ranges of the selection. + */ + if (mappedTo != null) + { + for (int i = mappedTo[0]; i <= mappedTo[1]; i++) + { + mappedColumns.addElement(i - 1); + } + } + } + + /** + * Helper method to find the range of columns mapped to from one column. + * Returns the maximal range of columns mapped to from all sequences in the + * source column, or null if no mappings were found. + * + * @param col + * @param mappings + * @param fromSequences + * @param toSequences + * @param fromGapChar + * @return + */ + protected static int[] findMappedColumns(int col, + List mappings, List fromSequences, + List toSequences, char fromGapChar) + { + int[] mappedTo = new int[] { Integer.MAX_VALUE, Integer.MIN_VALUE }; + boolean found = false; + + /* + * For each sequence in the 'from' alignment + */ + for (SequenceI fromSeq : fromSequences) + { + /* + * Ignore gaps (unmapped anyway) + */ + if (fromSeq.getCharAt(col) == fromGapChar) + { + continue; + } + + /* + * Get the residue position and find the mapped position. + */ + int residuePos = fromSeq.findPosition(col); + SearchResults sr = buildSearchResults(fromSeq, residuePos, + mappings); + for (Match m : sr.getResults()) + { + int mappedStartResidue = m.getStart(); + int mappedEndResidue = m.getEnd(); + SequenceI mappedSeq = m.getSequence(); /* - * Get the residue position and find the mapped position. + * Locate the aligned sequence whose dataset is mappedSeq. TODO a + * datamodel that can do this efficiently. */ - int residuePos = fromSeq.findPosition(col); - SearchResults sr = buildSearchResults(fromSeq, residuePos, - codonFrames); - for (Match m : sr.getResults()) + for (SequenceI toSeq : toSequences) { - int mappedStartResidue = m.getStart(); - int mappedEndResidue = m.getEnd(); - SequenceI mappedSeq = m.getSequence(); - - /* - * Locate the aligned sequence whose dataset is mappedSeq. TODO a - * datamodel that can do this efficiently. - */ - for (SequenceI toSeq : mapTo.getAlignment().getSequences()) + if (toSeq.getDatasetSequence() == mappedSeq) { - if (toSeq.getDatasetSequence() == mappedSeq) - { - int mappedStartCol = toSeq.findIndex(mappedStartResidue); - int mappedEndCol = toSeq.findIndex(mappedEndResidue); - mappedToMin = Math.min(mappedToMin, mappedStartCol); - mappedToMax = Math.max(mappedToMax, mappedEndCol); - // System.out.println(fromSeq.getName() + " mapped to cols " - // + mappedStartCol + ":" + mappedEndCol); - break; - // note: remove break if we ever want to map one to many sequences - } + int mappedStartCol = toSeq.findIndex(mappedStartResidue); + int mappedEndCol = toSeq.findIndex(mappedEndResidue); + mappedTo[0] = Math.min(mappedTo[0], mappedStartCol); + mappedTo[1] = Math.max(mappedTo[1], mappedEndCol); + found = true; + break; + // note: remove break if we ever want to map one to many sequences } } } - /* - * Add the range of mapped columns to the mapped selection (converting - * base 1 to base 0). Note that this may include intron-only regions which - * lie between the start and end ranges of the selection. - */ - for (int i = mappedToMin; i <= mappedToMax; i++) - { - mappedColumns.addElement(i - 1); - } } - return mappedColumns; + return found ? mappedTo : null; } /** - * Returns the mapped codon for a given aligned sequence column position (base - * 0). + * Returns the mapped codon or codons for a given aligned sequence column + * position (base 0). * * @param seq * an aligned peptide sequence @@ -496,26 +686,32 @@ public final class MappingUtils * an aligned column position (base 0) * @param mappings * a set of codon mappings - * @return the bases of the mapped codon in the cDNA dataset sequence, or null - * if not found + * @return the bases of the mapped codon(s) in the cDNA dataset sequence(s), + * or an empty list if none found */ - public static char[] findCodonFor(SequenceI seq, int col, - Set mappings) + public static List findCodonsFor(SequenceI seq, int col, + List mappings) { + List result = new ArrayList(); int dsPos = seq.findPosition(col); for (AlignedCodonFrame mapping : mappings) { if (mapping.involvesSequence(seq)) { - return mapping.getMappedCodon(seq.getDatasetSequence(), dsPos); + List codons = mapping.getMappedCodons( + seq.getDatasetSequence(), dsPos); + if (codons != null) + { + result.addAll(codons); + } } } - return null; + return result; } /** - * Converts a series of [start, end] ranges into an array of individual - * positions. + * Converts a series of [start, end] range pairs into an array of individual + * positions. This also caters for 'reverse strand' (start > end) cases. * * @param ranges * @return @@ -528,18 +724,172 @@ public final class MappingUtils int count = 0; for (int i = 0; i < ranges.length - 1; i += 2) { - count += ranges[i + 1] - ranges[i] + 1; + count += Math.abs(ranges[i + 1] - ranges[i]) + 1; } int[] result = new int[count]; int k = 0; for (int i = 0; i < ranges.length - 1; i += 2) { - for (int j = ranges[i]; j <= ranges[i + 1]; j++) + int from = ranges[i]; + final int to = ranges[i + 1]; + int step = from <= to ? 1 : -1; + do { - result[k++] = j; + result[k++] = from; + from += step; + } while (from != to + step); + } + return result; + } + + /** + * Returns a list of any mappings that are from or to the given (aligned or + * dataset) sequence. + * + * @param sequence + * @param mappings + * @return + */ + public static List findMappingsForSequence( + SequenceI sequence, List mappings) + { + List result = new ArrayList(); + if (sequence == null || mappings == null) + { + return result; + } + for (AlignedCodonFrame mapping : mappings) + { + if (mapping.involvesSequence(sequence)) + { + result.add(mapping); } } return result; } + + /** + * Returns the total length of the supplied ranges, which may be as single + * [start, end] or multiple [start, end, start, end ...] + * + * @param ranges + * @return + */ + public static int getLength(List ranges) + { + if (ranges == null) + { + return 0; + } + int length = 0; + for (int[] range : ranges) + { + if (range.length % 2 != 0) + { + System.err.println("Error unbalance start/end ranges: " + + ranges.toString()); + return 0; + } + for (int i = 0; i < range.length - 1; i += 2) + { + length += Math.abs(range[i + 1] - range[i]) + 1; + } + } + return length; + } + + /** + * Answers true if any range includes the given value + * + * @param ranges + * @param value + * @return + */ + public static boolean contains(List ranges, int value) + { + if (ranges == null) + { + return false; + } + for (int[] range : ranges) + { + if (range[1] >= range[0] && value >= range[0] && value <= range[1]) + { + /* + * value within ascending range + */ + return true; + } + if (range[1] < range[0] && value <= range[0] && value >= range[1]) + { + /* + * value within descending range + */ + return true; + } + } + return false; + } + + /** + * Removes a specified number of positions from the start of a ranges list. + * For example, could be used to adjust cds ranges to allow for an incomplete + * start codon. Subranges are removed completely, or their start positions + * adjusted, until the required number of positions has been removed from the + * range. Reverse strand ranges are supported. The input array is not + * modified. + * + * @param removeCount + * @param ranges + * an array of [start, end, start, end...] positions + * @return a new array with the first removeCount positions removed + */ + public static int[] removeStartPositions(int removeCount, + final int[] ranges) + { + if (removeCount <= 0) + { + return ranges; + } + + int[] copy = Arrays.copyOf(ranges, ranges.length); + int sxpos = -1; + int cdspos = 0; + for (int x = 0; x < copy.length && sxpos == -1; x += 2) + { + cdspos += Math.abs(copy[x + 1] - copy[x]) + 1; + if (removeCount < cdspos) + { + /* + * we have removed enough, time to finish + */ + sxpos = x; + + /* + * increment start of first exon, or decrement if reverse strand + */ + if (copy[x] <= copy[x + 1]) + { + copy[x] = copy[x + 1] - cdspos + removeCount + 1; + } + else + { + copy[x] = copy[x + 1] + cdspos - removeCount - 1; + } + break; + } + } + + if (sxpos > 0) + { + /* + * we dropped at least one entire sub-range - compact the array + */ + int[] nxon = new int[copy.length - sxpos]; + System.arraycopy(copy, sxpos, nxon, 0, copy.length - sxpos); + return nxon; + } + return copy; + } }