+
+ /*
+ * fancy case - aligning via mappings between sequences
+ */
+ List<SequenceI> unmapped = new ArrayList<SequenceI>();
+ Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
+ unaligned, aligned, unmapped);
+ int width = columnMap.size();
+ char gap = unaligned.getGapCharacter();
+ int realignedCount = 0;
+ // TODO: verify this loop scales sensibly for very wide/high alignments
+
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ if (!unmapped.contains(seq))
+ {
+ char[] newSeq = new char[width];
+ Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
+ // Integer iteration below
+ int newCol = 0;
+ int lastCol = 0;
+
+ /*
+ * traverse the map to find columns populated
+ * by our sequence
+ */
+ for (Integer column : columnMap.keySet())
+ {
+ Character c = columnMap.get(column).get(seq);
+ if (c != null)
+ {
+ /*
+ * sequence has a character at this position
+ *
+ */
+ newSeq[newCol] = c;
+ lastCol = newCol;
+ }
+ newCol++;
+ }
+
+ /*
+ * trim trailing gaps
+ */
+ if (lastCol < width)
+ {
+ char[] tmp = new char[lastCol + 1];
+ System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
+ newSeq = tmp;
+ }
+ // TODO: optimise SequenceI to avoid char[]->String->char[]
+ seq.setSequence(String.valueOf(newSeq));
+ realignedCount++;
+ }
+ }
+ return realignedCount;
+ }
+
+ /**
+ * If unaligned and aligned sequences share the same dataset sequences, then
+ * simply copies the aligned sequences to the unaligned sequences and returns
+ * true; else returns false
+ *
+ * @param unaligned
+ * - sequences to be aligned based on aligned
+ * @param aligned
+ * - 'guide' alignment containing sequences derived from same dataset
+ * as unaligned
+ * @return
+ */
+ static boolean alignAsSameSequences(AlignmentI unaligned,
+ AlignmentI aligned)
+ {
+ if (aligned.getDataset() == null || unaligned.getDataset() == null)
+ {
+ return false; // should only pass alignments with datasets here
+ }
+
+ // map from dataset sequence to alignment sequence(s)
+ Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<SequenceI, List<SequenceI>>();
+ for (SequenceI seq : aligned.getSequences())
+ {
+ SequenceI ds = seq.getDatasetSequence();
+ if (alignedDatasets.get(ds) == null)
+ {
+ alignedDatasets.put(ds, new ArrayList<SequenceI>());
+ }
+ alignedDatasets.get(ds).add(seq);
+ }
+
+ /*
+ * first pass - check whether all sequences to be aligned share a dataset
+ * sequence with an aligned sequence
+ */
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ if (!alignedDatasets.containsKey(seq.getDatasetSequence()))
+ {
+ return false;
+ }
+ }
+
+ /*
+ * second pass - copy aligned sequences;
+ * heuristic rule: pair off sequences in order for the case where
+ * more than one shares the same dataset sequence
+ */
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ List<SequenceI> alignedSequences = alignedDatasets.get(seq
+ .getDatasetSequence());
+ // TODO: getSequenceAsString() will be deprecated in the future
+ // TODO: need to leave to SequenceI implementor to update gaps
+ seq.setSequence(alignedSequences.get(0).getSequenceAsString());
+ if (alignedSequences.size() > 0)
+ {
+ // pop off aligned sequences (except the last one)
+ alignedSequences.remove(0);
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Returns a map whose key is alignment column number (base 1), and whose
+ * values are a map of sequence characters in that column.
+ *
+ * @param unaligned
+ * @param aligned
+ * @param unmapped
+ * @return
+ */
+ static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
+ AlignmentI unaligned, AlignmentI aligned, List<SequenceI> unmapped)
+ {
+ /*
+ * Map will hold, for each aligned column position, a map of
+ * {unalignedSequence, characterPerSequence} at that position.
+ * TreeMap keeps the entries in ascending column order.
+ */
+ SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
+
+ /*
+ * record any sequences that have no mapping so can't be realigned
+ */
+ unmapped.addAll(unaligned.getSequences());
+
+ List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
+
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
+ if (fromSeq != null)
+ {
+ Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
+ if (addMappedPositions(seq, fromSeq, seqMap, map))
+ {
+ unmapped.remove(seq);
+ }
+ }
+ }
+ }
+ return map;
+ }
+
+ /**
+ * Helper method that adds to a map the mapped column positions of a sequence. <br>
+ * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
+ * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
+ * sequence.
+ *
+ * @param seq
+ * the sequence whose column positions we are recording
+ * @param fromSeq
+ * a sequence that is mapped to the first sequence
+ * @param seqMap
+ * the mapping from 'fromSeq' to 'seq'
+ * @param map
+ * a map to add the column positions (in fromSeq) of the mapped
+ * positions of seq
+ * @return
+ */
+ static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
+ Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
+ {
+ if (seqMap == null)
+ {
+ return false;
+ }
+
+ /*
+ * invert mapping if it is from unaligned to aligned sequence
+ */
+ if (seqMap.getTo() == fromSeq.getDatasetSequence())
+ {
+ seqMap = new Mapping(seq.getDatasetSequence(), seqMap.getMap()
+ .getInverse());
+ }
+
+ char[] fromChars = fromSeq.getSequence();
+ int toStart = seq.getStart();
+ char[] toChars = seq.getSequence();
+
+ /*
+ * traverse [start, end, start, end...] ranges in fromSeq
+ */
+ for (int[] fromRange : seqMap.getMap().getFromRanges())
+ {
+ for (int i = 0; i < fromRange.length - 1; i += 2)
+ {
+ boolean forward = fromRange[i + 1] >= fromRange[i];
+
+ /*
+ * find the range mapped to (sequence positions base 1)
+ */
+ int[] range = seqMap.locateMappedRange(fromRange[i],
+ fromRange[i + 1]);
+ if (range == null)
+ {
+ System.err.println("Error in mapping " + seqMap + " from "
+ + fromSeq.getName());
+ return false;
+ }
+ int fromCol = fromSeq.findIndex(fromRange[i]);
+ int mappedCharPos = range[0];
+
+ /*
+ * walk over the 'from' aligned sequence in forward or reverse
+ * direction; when a non-gap is found, record the column position
+ * of the next character of the mapped-to sequence; stop when all
+ * the characters of the range have been counted
+ */
+ while (mappedCharPos <= range[1] && fromCol <= fromChars.length
+ && fromCol >= 0)
+ {
+ if (!Comparison.isGap(fromChars[fromCol - 1]))
+ {
+ /*
+ * mapped from sequence has a character in this column
+ * record the column position for the mapped to character
+ */
+ Map<SequenceI, Character> seqsMap = map.get(fromCol);
+ if (seqsMap == null)
+ {
+ seqsMap = new HashMap<SequenceI, Character>();
+ map.put(fromCol, seqsMap);
+ }
+ seqsMap.put(seq, toChars[mappedCharPos - toStart]);
+ mappedCharPos++;
+ }
+ fromCol += (forward ? 1 : -1);
+ }
+ }
+ }
+ return true;
+ }
+
+ // strictly temporary hack until proper criteria for aligning protein to cds
+ // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
+ public static boolean looksLikeEnsembl(AlignmentI alignment)
+ {
+ for (SequenceI seq : alignment.getSequences())
+ {
+ String name = seq.getName();
+ if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
+ {
+ return false;
+ }
+ }
+ return true;