+ /*
+ * map from peptide position to all variants of the codon which codes for it
+ * LinkedHashMap ensures we keep the peptide features in sequence order
+ */
+ LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<Integer, List<DnaVariant>[]>();
+ SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+
+ SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
+ if (dnaFeatures == null)
+ {
+ return variants;
+ }
+
+ int dnaStart = dnaSeq.getStart();
+ int[] lastCodon = null;
+ int lastPeptidePostion = 0;
+
+ /*
+ * build a map of codon variations for peptides
+ */
+ for (SequenceFeature sf : dnaFeatures)
+ {
+ int dnaCol = sf.getBegin();
+ if (dnaCol != sf.getEnd())
+ {
+ // not handling multi-locus variant features
+ continue;
+ }
+ if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+ {
+ int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+ if (mapsTo == null)
+ {
+ // feature doesn't lie within coding region
+ continue;
+ }
+ int peptidePosition = mapsTo[0];
+ List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
+ if (codonVariants == null)
+ {
+ codonVariants = new ArrayList[3];
+ codonVariants[0] = new ArrayList<DnaVariant>();
+ codonVariants[1] = new ArrayList<DnaVariant>();
+ codonVariants[2] = new ArrayList<DnaVariant>();
+ variants.put(peptidePosition, codonVariants);
+ }
+
+ /*
+ * extract dna variants to a string array
+ */
+ String alls = (String) sf.getValue("alleles");
+ if (alls == null)
+ {
+ continue;
+ }
+ String[] alleles = alls.toUpperCase().split(",");
+ int i = 0;
+ for (String allele : alleles)
+ {
+ alleles[i++] = allele.trim(); // lose any space characters "A, G"
+ }
+
+ /*
+ * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+ */
+ int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+ : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+ peptidePosition, peptidePosition));
+ lastPeptidePostion = peptidePosition;
+ lastCodon = codon;
+
+ /*
+ * save nucleotide (and any variant) for each codon position
+ */
+ for (int codonPos = 0; codonPos < 3; codonPos++)
+ {
+ String nucleotide = String.valueOf(
+ dnaSeq.getCharAt(codon[codonPos] - dnaStart))
+ .toUpperCase();
+ List<DnaVariant> codonVariant = codonVariants[codonPos];
+ if (codon[codonPos] == dnaCol)
+ {
+ if (!codonVariant.isEmpty()
+ && codonVariant.get(0).variant == null)
+ {
+ /*
+ * already recorded base value, add this variant
+ */
+ codonVariant.get(0).variant = sf;
+ }
+ else
+ {
+ /*
+ * add variant with base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide, sf));
+ }
+ }
+ else if (codonVariant.isEmpty())
+ {
+ /*
+ * record (possibly non-varying) base value
+ */
+ codonVariant.add(new DnaVariant(nucleotide));
+ }
+ }
+ }
+ }
+ return variants;
+ }
+
+ /**
+ * Builds a new alignment holding a copy of the supplied sequences, then
+ * appends every sequence that is the target ("to" end) of a mapping held on
+ * a database reference of one of the cross-referenced sequences, provided a
+ * SequenceIdMatcher lookup finds no match for it among the sequences already
+ * collected.
+ *
+ * @param seqs
+ * the sequences to copy into the new alignment
+ * @param xrefs
+ * the cross-referenced sequences whose database references are
+ * scanned for mapped sequences; may be null, in which case only
+ * the copy of seqs is returned
+ * @param dataset
+ * the alignment dataset shared by the new copy
+ * @return the new alignment (a copy of seqs plus any mapped sequences added)
+ */
+ public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
+ SequenceI[] xrefs, AlignmentI dataset)
+ {
+ AlignmentI result = new Alignment(new Alignment(seqs));
+ result.setDataset(dataset);
+
+ /*
+ * the matcher starts with the supplied sequences and grows as mapped
+ * sequences are added, so each mapped sequence is added at most once
+ */
+ SequenceIdMatcher known = new SequenceIdMatcher(seqs);
+ if (xrefs == null)
+ {
+ return result;
+ }
+
+ for (SequenceI xref : xrefs)
+ {
+ DBRefEntry[] refs = xref.getDBRefs();
+ if (refs == null)
+ {
+ continue;
+ }
+ for (DBRefEntry ref : refs)
+ {
+ /*
+ * skip database references with no mapping or no mapped-to sequence
+ */
+ SequenceI target = ref.getMap() == null ? null : ref.getMap()
+ .getTo();
+ if (target == null)
+ {
+ continue;
+ }
+ if (known.findIdMatch(target) == null)
+ {
+ known.add(target);
+ result.addSequence(target);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Attempts to align the sequences in 'unaligned' so that they match the
+ * alignment of their mapped regions in 'aligned'. A typical use would be
+ * aligning CDS sequences which are mapped to their parent cDNA sequences.
+ *
+ * Only 1:1 mappings (dna-to-dna or protein-to-protein) are handled here; for
+ * dna-to-protein or protein-to-dna use alternative methods.
+ *
+ * @param unaligned
+ * sequences to be aligned; their sequence strings are replaced
+ * @param aligned
+ * holds aligned sequences and their mappings
+ * @return the number of sequences realigned: the height of 'unaligned' when
+ * alignAsSameSequences succeeds, otherwise the count of sequences
+ * not reported as unmapped by buildMappedColumnsMap
+ */
+ public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
+ {
+ /*
+ * shortcut - the helper reports that it has already done the alignment
+ */
+ if (alignAsSameSequences(unaligned, aligned))
+ {
+ return unaligned.getHeight();
+ }
+
+ /*
+ * otherwise align through the mappings between sequences; the helper
+ * fills 'notMapped' with the sequences it could not place
+ */
+ List<SequenceI> notMapped = new ArrayList<SequenceI>();
+ Map<Integer, Map<SequenceI, Character>> mappedColumns = buildMappedColumnsMap(
+ unaligned, aligned, notMapped);
+ final int alignedWidth = mappedColumns.size();
+ final char gapChar = unaligned.getGapCharacter();
+ int count = 0;
+
+ for (SequenceI seq : unaligned.getSequences())
+ {
+ if (notMapped.contains(seq))
+ {
+ continue;
+ }
+
+ /*
+ * start from an all-gap row, one cell per mapped column
+ */
+ char[] residues = new char[alignedWidth];
+ Arrays.fill(residues, gapChar);
+ int lastFilled = 0;
+ int col = 0;
+
+ /*
+ * visit the columns in the map's iteration order, writing this
+ * sequence's character wherever it has one
+ */
+ for (Map<SequenceI, Character> columnResidues : mappedColumns.values())
+ {
+ Character residue = columnResidues.get(seq);
+ if (residue != null)
+ {
+ residues[col] = residue;
+ lastFilled = col;
+ }
+ col++;
+ }
+
+ /*
+ * drop trailing gaps after the last populated column; the test also
+ * protects the zero-width case, where there is nothing to keep
+ */
+ if (lastFilled < alignedWidth)
+ {
+ residues = Arrays.copyOf(residues, lastFilled + 1);
+ }
+ seq.setSequence(String.valueOf(residues));
+ count++;
+ }
+ return count;
+ }
+
+ /**
+ * If unaligned and aligned sequences share the same dataset sequences, then
+ * simply copies the aligned sequences to the unaligned sequences and returns
+ * true; else returns false
+ *
+ * @param unaligned
+ * @param aligned
+ * @return
+ */
+ static boolean alignAsSameSequences(AlignmentI unaligned,
+ AlignmentI aligned)
+ {
+ if (aligned.getDataset() == null || unaligned.getDataset() == null)
+ {
+ return false; // should only pass alignments with datasets here
+ }
+
+ Map<SequenceI, SequenceI> alignedDatasets = new HashMap<SequenceI, SequenceI>();
+ for (SequenceI seq : aligned.getSequences())
+ {
+ alignedDatasets.put(seq.getDatasetSequence(), seq);
+ }
+
+ /*
+ * first pass - check whether all sequences to be aligned share a dataset
+ * sequence with an aligned sequence
+ */
+ for (SequenceI seq : unaligned.getSequences())