+
+ int dnaStart = dnaSeq.getStart();
+ int[] lastCodon = null;
+ int lastPeptidePostion = 0;
+
+ /*
+ * build a map of codon variations for peptides
+ */
+ for (SequenceFeature sf : dnaFeatures)
+ {
+ int dnaCol = sf.getBegin();
+ if (dnaCol != sf.getEnd())
+ {
+ // not handling multi-locus variant features
+ continue;
+ }
+ if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+ {
+ int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+ if (mapsTo == null)
+ {
+ // feature doesn't lie within coding region
+ continue;
+ }
+ int peptidePosition = mapsTo[0];
+ List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
+ if (codonVariants == null)
+ {
+ codonVariants = new ArrayList[3];
+ codonVariants[0] = new ArrayList<DnaVariant>();
+ codonVariants[1] = new ArrayList<DnaVariant>();
+ codonVariants[2] = new ArrayList<DnaVariant>();
+ variants.put(peptidePosition, codonVariants);
+ }
+
+ /*
+ * extract dna variants to a string array
+ */
+ String alls = (String) sf.getValue("alleles");
+ if (alls == null)
+ {
+ continue;
+ }
+ String[] alleles = alls.toUpperCase().split(",");
+ int i = 0;
+ for (String allele : alleles)
+ {
+ alleles[i++] = allele.trim(); // lose any space characters "A, G"
+ }
+
+ /*
+ * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+ */
+ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+ : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+ peptidePosition, peptidePosition));
+ lastPeptidePostion = peptidePosition;
+ lastCodon = codon;
+
+ /*
+ * save nucleotide (and any variant) for each codon position
+ */
+ for (int codonPos = 0; codonPos < 3; codonPos++)
+ {
+ String nucleotide = String.valueOf(
+ dnaSeq.getCharAt(codon[codonPos] - dnaStart))
+ .toUpperCase();
+ List<DnaVariant> codonVariant = codonVariants[codonPos];
+ if (codon[codonPos] == dnaCol)
+ {
+ if (!codonVariant.isEmpty()
+ && codonVariant.get(0).variant == null)
+ {
+ /*
+ * already recorded base value, add this variant
+ */
+ codonVariant.get(0).variant = sf;
+ }
+ else
+ {
+ /*
+ * add variant with base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide, sf));
+ }
+ }
+ else if (codonVariant.isEmpty())
+ {
+ /*
+ * record (possibly non-varying) base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide));
+ }
+ }
+ }
+ }
+ return variants;
+ }
+
+ /**
+ * Makes an alignment with a copy of the given sequences, adding in any
+ * non-redundant sequences which are mapped to by the cross-referenced
+ * sequences.
+ *
+ * @param seqs
+ * @param xrefs
+ * @return
+ */
+ public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
+ SequenceI[] xrefs)
+ {
+ AlignmentI copy = new Alignment(new Alignment(seqs));
+
+ /*
+ * add mappings between sequences to the new alignment
+ */
+ AlignedCodonFrame mappings = new AlignedCodonFrame();
+ copy.addCodonFrame(mappings);
+ for (int i = 0; i < copy.getHeight(); i++)
+ {
+ SequenceI from = seqs[i];
+ SequenceI to = copy.getSequenceAt(i);
+ if (to.getDatasetSequence() != null)
+ {
+ to = to.getDatasetSequence();
+ }
+ int start = from.getStart();
+ int end = from.getEnd();
+ MapList map = new MapList(new int[] { start, end }, new int[] {
+ start, end }, 1, 1);
+ mappings.addMap(to, from, map);
+ }
+
+ SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
+ if (xrefs != null)
+ {
+ for (SequenceI xref : xrefs)
+ {
+ DBRefEntry[] dbrefs = xref.getDBRefs();
+ if (dbrefs != null)
+ {
+ for (DBRefEntry dbref : dbrefs)
+ {
+ if (dbref.getMap() == null || dbref.getMap().getTo() == null)
+ {
+ continue;
+ }
+ SequenceI mappedTo = dbref.getMap().getTo();
+ SequenceI match = matcher.findIdMatch(mappedTo);
+ if (match == null)
+ {
+ matcher.add(mappedTo);
+ copy.addSequence(mappedTo);
+ }
+ }
+ }
+ }
+ }
+ return copy;
+ }
+
+ /**
+ * Try to align sequences in 'unaligned' to match the alignment of their
+ * mapped regions in 'aligned'. For example, could use this to align CDS
+ * sequences which are mapped to their parent cDNA sequences.
+ *
+ * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
+ * dna-to-protein or protein-to-dna use alternative methods.
+ *
+ * @param unaligned
+ * sequences to be aligned
+ * @param aligned
+ * holds aligned sequences and their mappings
+ * @return
+ */
+ public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
+ {
+ List<SequenceI> unmapped = new ArrayList<SequenceI>();
+ Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
+ unaligned, aligned, unmapped);
+ int width = columnMap.size();
+ char gap = unaligned.getGapCharacter();
+ int realignedCount = 0;
+
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ if (!unmapped.contains(seq))
+ {
+ char[] newSeq = new char[width];
+ Arrays.fill(newSeq, gap);
+ int newCol = 0;
+ int lastCol = 0;
+
+ /*
+ * traverse the map to find columns populated
+ * by our sequence
+ */
+ for (Integer column : columnMap.keySet())
+ {
+ Character c = columnMap.get(column).get(seq);
+ if (c != null)
+ {
+ /*
+ * sequence has a character at this position
+ *
+ */
+ newSeq[newCol] = c;
+ lastCol = newCol;
+ }
+ newCol++;
+ }
+
+ /*
+ * trim trailing gaps
+ */
+ if (lastCol < width)
+ {
+ char[] tmp = new char[lastCol + 1];
+ System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
+ newSeq = tmp;
+ }
+ seq.setSequence(String.valueOf(newSeq));
+ realignedCount++;
+ }
+ }
+ return realignedCount;
+ }
+
+ /**
+ * Returns a map whose key is alignment column number (base 1), and whose
+ * values are a map of sequence characters in that column.
+ *
+ * @param unaligned
+ * @param aligned
+ * @param unmapped
+ * @return
+ */
+ static Map<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
+ AlignmentI unaligned, AlignmentI aligned, List<SequenceI> unmapped)
+ {
+ /*
+ * Map will hold, for each aligned column position, a map of
+ * {unalignedSequence, sequenceCharacter} at that position.
+ * TreeMap keeps the entries in ascending column order.
+ */
+ Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
+
+ /*
+ * r any sequences that have no mapping so can't be realigned
+ */
+ unmapped.addAll(unaligned.getSequences());
+
+ List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
+
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ for (AlignedCodonFrame mapping : mappings)
+ {
+ SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
+ if (fromSeq != null)
+ {
+ Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
+ if (addMappedPositions(seq, fromSeq, seqMap, map))
+ {
+ unmapped.remove(seq);
+ }
+ }
+ }
+ }
+ return map;
+ }
+
+ /**
+ * Helper method that adds to a map the mapped column positions of a sequence. <br>
+ * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
+ * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
+ * sequence.
+ *
+ * @param seq
+ * the sequence whose column positions we are recording
+ * @param fromSeq
+ * a sequence that is mapped to the first sequence
+ * @param seqMap
+ * the mapping from 'fromSeq' to 'seq'
+ * @param map
+ * a map to add the column positions (in fromSeq) of the mapped
+ * positions of seq
+ * @return
+ */
+ static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
+ Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
+ {
+ if (seqMap == null)
+ {
+ return false;
+ }
+
+ char[] fromChars = fromSeq.getSequence();
+ int toStart = seq.getStart();
+ char[] toChars = seq.getSequence();
+
+ /*
+ * traverse [start, end, start, end...] ranges in fromSeq
+ */
+ for (int[] fromRange : seqMap.getMap().getFromRanges())
+ {
+ for (int i = 0; i < fromRange.length - 1; i += 2)
+ {
+ boolean forward = fromRange[i + 1] >= fromRange[i];
+
+ /*
+ * find the range mapped to (sequence positions base 1)
+ */
+ int[] range = seqMap.locateMappedRange(fromRange[i],
+ fromRange[i + 1]);
+ if (range == null)
+ {
+ System.err.println("Error in mapping " + seqMap + " from "
+ + fromSeq.getName());
+ return false;
+ }
+ int fromCol = fromSeq.findIndex(fromRange[i]);
+ int mappedCharPos = range[0];
+
+ /*
+ * walk over the 'from' aligned sequence in forward or reverse
+ * direction; when a non-gap is found, record the column position
+ * of the next character of the mapped-to sequence; stop when all
+ * the characters of the range have been counted
+ */
+ while (mappedCharPos <= range[1])
+ {
+ if (!Comparison.isGap(fromChars[fromCol - 1]))
+ {
+ /*
+ * mapped from sequence has a character in this column
+ * record the column position for the mapped to character
+ */
+ Map<SequenceI, Character> seqsMap = map.get(fromCol);
+ if (seqsMap == null)
+ {
+ seqsMap = new HashMap<SequenceI, Character>();
+ map.put(fromCol, seqsMap);
+ }
+ seqsMap.put(seq, toChars[mappedCharPos - toStart]);
+ mappedCharPos++;
+ }
+ fromCol += (forward ? 1 : -1);
+ }
+ }
+ }
+ return true;
+ }
+
+ // strictly temporary hack until proper criteria for aligning protein to cds
+ // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
+ public static boolean looksLikeEnsembl(AlignmentI alignment)
+ {
+ for (SequenceI seq : alignment.getSequences())
+ {
+ String name = seq.getName();
+ if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
+ {
+ return false;
+ }
+ }
+ return true;