2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.datamodel.AlignedCodon;
24 import jalview.datamodel.AlignedCodonFrame;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentAnnotation;
27 import jalview.datamodel.AlignmentI;
28 import jalview.datamodel.DBRefEntry;
29 import jalview.datamodel.IncompleteCodonException;
30 import jalview.datamodel.Mapping;
31 import jalview.datamodel.Sequence;
32 import jalview.datamodel.SequenceFeature;
33 import jalview.datamodel.SequenceGroup;
34 import jalview.datamodel.SequenceI;
35 import jalview.io.gff.SequenceOntologyFactory;
36 import jalview.io.gff.SequenceOntologyI;
37 import jalview.schemes.ResidueProperties;
38 import jalview.util.Comparison;
39 import jalview.util.MapList;
40 import jalview.util.MappingUtils;
41 import jalview.util.StringUtils;
43 import java.util.ArrayList;
44 import java.util.Arrays;
45 import java.util.Collection;
46 import java.util.Collections;
47 import java.util.Comparator;
48 import java.util.HashMap;
49 import java.util.HashSet;
50 import java.util.Iterator;
51 import java.util.LinkedHashMap;
52 import java.util.List;
54 import java.util.Map.Entry;
55 import java.util.NoSuchElementException;
57 import java.util.TreeMap;
60 * grab bag of useful alignment manipulation operations Expect these to be
61 * refactored elsewhere at some point.
66 public class AlignmentUtils
70 * given an existing alignment, create a new alignment including all, or up to
71 * flankSize additional symbols from each sequence's dataset sequence
77 public static AlignmentI expandContext(AlignmentI core, int flankSize)
79 List<SequenceI> sq = new ArrayList<SequenceI>();
81 for (SequenceI s : core.getSequences())
83 SequenceI newSeq = s.deriveSequence();
84 final int newSeqStart = newSeq.getStart() - 1;
85 if (newSeqStart > maxoffset
86 && newSeq.getDatasetSequence().getStart() < s.getStart())
88 maxoffset = newSeqStart;
94 maxoffset = Math.min(maxoffset, flankSize);
98 * now add offset left and right to create an expanded alignment
100 for (SequenceI s : sq)
103 while (ds.getDatasetSequence() != null)
105 ds = ds.getDatasetSequence();
107 int s_end = s.findPosition(s.getStart() + s.getLength());
108 // find available flanking residues for sequence
109 int ustream_ds = s.getStart() - ds.getStart();
110 int dstream_ds = ds.getEnd() - s_end;
112 // build new flanked sequence
114 // compute gap padding to start of flanking sequence
115 int offset = maxoffset - ustream_ds;
117 // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
120 if (flankSize < ustream_ds)
122 // take up to flankSize residues
123 offset = maxoffset - flankSize;
124 ustream_ds = flankSize;
126 if (flankSize <= dstream_ds)
128 dstream_ds = flankSize - 1;
131 // TODO use Character.toLowerCase to avoid creating String objects?
132 char[] upstream = new String(ds.getSequence(s.getStart() - 1
133 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
134 char[] downstream = new String(ds.getSequence(s_end - 1, s_end
135 + dstream_ds)).toLowerCase().toCharArray();
136 char[] coreseq = s.getSequence();
137 char[] nseq = new char[offset + upstream.length + downstream.length
139 char c = core.getGapCharacter();
142 for (; p < offset; p++)
147 System.arraycopy(upstream, 0, nseq, p, upstream.length);
148 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
150 System.arraycopy(downstream, 0, nseq, p + coreseq.length
151 + upstream.length, downstream.length);
152 s.setSequence(new String(nseq));
153 s.setStart(s.getStart() - ustream_ds);
154 s.setEnd(s_end + downstream.length);
156 AlignmentI newAl = new jalview.datamodel.Alignment(
157 sq.toArray(new SequenceI[0]));
158 for (SequenceI s : sq)
160 if (s.getAnnotation() != null)
162 for (AlignmentAnnotation aa : s.getAnnotation())
164 aa.adjustForAlignment(); // JAL-1712 fix
165 newAl.addAnnotation(aa);
169 newAl.setDataset(core.getDataset());
174 * Returns the index (zero-based position) of a sequence in an alignment, or
181 public static int getSequenceIndex(AlignmentI al, SequenceI seq)
185 for (SequenceI alSeq : al.getSequences())
198 * Returns a map of lists of sequences in the alignment, keyed by sequence
199 * name. For use in mapping between different alignment views of the same
202 * @see jalview.datamodel.AlignmentI#getSequencesByName()
204 public static Map<String, List<SequenceI>> getSequencesByName(
207 Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
208 for (SequenceI seq : al.getSequences())
210 String name = seq.getName();
213 List<SequenceI> seqs = theMap.get(name);
216 seqs = new ArrayList<SequenceI>();
217 theMap.put(name, seqs);
226 * Build mapping of protein to cDNA alignment. Mappings are made between
227 * sequences where the cDNA translates to the protein sequence. Any new
228 * mappings are added to the protein alignment. Returns true if any mappings
229 * either already exist or were added, else false.
231 * @param proteinAlignment
232 * @param cdnaAlignment
235 public static boolean mapProteinAlignmentToCdna(
236 final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
238 if (proteinAlignment == null || cdnaAlignment == null)
243 Set<SequenceI> mappedDna = new HashSet<SequenceI>();
244 Set<SequenceI> mappedProtein = new HashSet<SequenceI>();
247 * First pass - map sequences where cross-references exist. This include
248 * 1-to-many mappings to support, for example, variant cDNA.
250 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
251 cdnaAlignment, mappedDna, mappedProtein, true);
254 * Second pass - map sequences where no cross-references exist. This only
255 * does 1-to-1 mappings and assumes corresponding sequences are in the same
256 * order in the alignments.
258 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
259 mappedDna, mappedProtein, false);
260 return mappingPerformed;
264 * Make mappings between compatible sequences (where the cDNA translation
265 * matches the protein).
267 * @param proteinAlignment
268 * @param cdnaAlignment
270 * a set of mapped DNA sequences (to add to)
271 * @param mappedProtein
272 * a set of mapped Protein sequences (to add to)
274 * if true, only map sequences where xrefs exist
277 protected static boolean mapProteinToCdna(
278 final AlignmentI proteinAlignment,
279 final AlignmentI cdnaAlignment, Set<SequenceI> mappedDna,
280 Set<SequenceI> mappedProtein, boolean xrefsOnly)
282 boolean mappingExistsOrAdded = false;
283 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
284 for (SequenceI aaSeq : thisSeqs)
286 boolean proteinMapped = false;
287 AlignedCodonFrame acf = new AlignedCodonFrame();
289 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
292 * Always try to map if sequences have xref to each other; this supports
293 * variant cDNA or alternative splicing for a protein sequence.
295 * If no xrefs, try to map progressively, assuming that alignments have
296 * mappable sequences in corresponding order. These are not
297 * many-to-many, as that would risk mixing species with similar cDNA
300 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
306 * Don't map non-xrefd sequences more than once each. This heuristic
307 * allows us to pair up similar sequences in ordered alignments.
310 && (mappedProtein.contains(aaSeq) || mappedDna
315 if (mappingExists(proteinAlignment.getCodonFrames(),
316 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
318 mappingExistsOrAdded = true;
322 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
325 acf.addMap(cdnaSeq, aaSeq, map);
326 mappingExistsOrAdded = true;
327 proteinMapped = true;
328 mappedDna.add(cdnaSeq);
329 mappedProtein.add(aaSeq);
335 proteinAlignment.addCodonFrame(acf);
338 return mappingExistsOrAdded;
342 * Answers true if the mappings include one between the given (dataset)
345 public static boolean mappingExists(List<AlignedCodonFrame> mappings,
346 SequenceI aaSeq, SequenceI cdnaSeq)
348 if (mappings != null)
350 for (AlignedCodonFrame acf : mappings)
352 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
362 * Builds a mapping (if possible) of a cDNA to a protein sequence.
364 * <li>first checks if the cdna translates exactly to the protein sequence</li>
365 * <li>else checks for translation after removing a STOP codon</li>
366 * <li>else checks for translation after removing a START codon</li>
367 * <li>if that fails, inspect CDS features on the cDNA sequence</li>
369 * Returns null if no mapping is determined.
372 * the aligned protein sequence
374 * the aligned cdna sequence
377 public static MapList mapCdnaToProtein(SequenceI proteinSeq,
381 * Here we handle either dataset sequence set (desktop) or absent (applet).
382 * Use only the char[] form of the sequence to avoid creating possibly large
385 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
386 char[] aaSeqChars = proteinDataset != null ? proteinDataset
387 .getSequence() : proteinSeq.getSequence();
388 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
389 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
390 : cdnaSeq.getSequence();
391 if (aaSeqChars == null || cdnaSeqChars == null)
397 * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
399 final int mappedLength = 3 * aaSeqChars.length;
400 int cdnaLength = cdnaSeqChars.length;
401 int cdnaStart = cdnaSeq.getStart();
402 int cdnaEnd = cdnaSeq.getEnd();
403 final int proteinStart = proteinSeq.getStart();
404 final int proteinEnd = proteinSeq.getEnd();
407 * If lengths don't match, try ignoring stop codon (if present)
409 if (cdnaLength != mappedLength && cdnaLength > 2)
411 String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
413 for (String stop : ResidueProperties.STOP)
415 if (lastCodon.equals(stop))
425 * If lengths still don't match, try ignoring start codon.
428 if (cdnaLength != mappedLength
430 && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
431 .equals(ResidueProperties.START))
438 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
441 * protein is translation of dna (+/- start/stop codons)
443 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
444 { proteinStart, proteinEnd }, 3, 1);
449 * translation failed - try mapping CDS annotated regions of dna
451 return mapCdsToProtein(cdnaSeq, proteinSeq);
455 * Test whether the given cdna sequence, starting at the given offset,
456 * translates to the given amino acid sequence, using the standard translation
457 * table. Designed to fail fast i.e. as soon as a mismatch position is found.
459 * @param cdnaSeqChars
464 protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
467 if (cdnaSeqChars == null || aaSeqChars == null)
473 int dnaPos = cdnaStart;
474 for (; dnaPos < cdnaSeqChars.length - 2
475 && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
477 String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
478 final String translated = ResidueProperties.codonTranslate(codon);
481 * allow * in protein to match untranslatable in dna
483 final char aaRes = aaSeqChars[aaPos];
484 if ((translated == null || "STOP".equals(translated)) && aaRes == '*')
488 if (translated == null || !(aaRes == translated.charAt(0)))
491 // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": "
492 // + codon + "(" + translated + ") != " + aaRes));
498 * check we matched all of the protein sequence
500 if (aaPos != aaSeqChars.length)
506 * check we matched all of the dna except
507 * for optional trailing STOP codon
509 if (dnaPos == cdnaSeqChars.length)
513 if (dnaPos == cdnaSeqChars.length - 3)
515 String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
516 if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
525 * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
526 * currently assumes that we are aligning cDNA to match protein.
529 * the sequence to be realigned
531 * the alignment whose sequence alignment is to be 'copied'
533 * character string represent a gap in the realigned sequence
534 * @param preserveUnmappedGaps
535 * @param preserveMappedGaps
536 * @return true if the sequence was realigned, false if it could not be
538 public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
539 String gap, boolean preserveMappedGaps,
540 boolean preserveUnmappedGaps)
543 * Get any mappings from the source alignment to the target (dataset)
546 // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
547 // all mappings. Would it help to constrain this?
548 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
549 if (mappings == null || mappings.isEmpty())
555 * Locate the aligned source sequence whose dataset sequence is mapped. We
556 * just take the first match here (as we can't align like more than one
559 SequenceI alignFrom = null;
560 AlignedCodonFrame mapping = null;
561 for (AlignedCodonFrame mp : mappings)
563 alignFrom = mp.findAlignedSequence(seq, al);
564 if (alignFrom != null)
571 if (alignFrom == null)
575 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
576 preserveMappedGaps, preserveUnmappedGaps);
581 * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
582 * match residues and codons. Flags control whether existing gaps in unmapped
583 * (intron) and mapped (exon) regions are preserved or not. Gaps between
584 * intron and exon are only retained if both flags are set.
591 * @param preserveUnmappedGaps
592 * @param preserveMappedGaps
594 public static void alignSequenceAs(SequenceI alignTo,
595 SequenceI alignFrom, AlignedCodonFrame mapping, String myGap,
596 char sourceGap, boolean preserveMappedGaps,
597 boolean preserveUnmappedGaps)
599 // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
601 // aligned and dataset sequence positions, all base zero
605 int basesWritten = 0;
606 char myGapChar = myGap.charAt(0);
607 int ratio = myGap.length();
609 int fromOffset = alignFrom.getStart() - 1;
610 int toOffset = alignTo.getStart() - 1;
611 int sourceGapMappedLength = 0;
612 boolean inExon = false;
613 final char[] thisSeq = alignTo.getSequence();
614 final char[] thatAligned = alignFrom.getSequence();
615 StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
618 * Traverse the 'model' aligned sequence
620 for (char sourceChar : thatAligned)
622 if (sourceChar == sourceGap)
624 sourceGapMappedLength += ratio;
629 * Found a non-gap character. Locate its mapped region if any.
632 // Note mapping positions are base 1, our sequence positions base 0
633 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
634 sourceDsPos + fromOffset);
635 if (mappedPos == null)
638 * unmapped position; treat like a gap
640 sourceGapMappedLength += ratio;
641 // System.err.println("Can't align: no codon mapping to residue "
642 // + sourceDsPos + "(" + sourceChar + ")");
647 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
648 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
649 StringBuilder trailingCopiedGap = new StringBuilder();
652 * Copy dna sequence up to and including this codon. Optionally, include
653 * gaps before the codon starts (in introns) and/or after the codon starts
656 * Note this only works for 'linear' splicing, not reverse or interleaved.
657 * But then 'align dna as protein' doesn't make much sense otherwise.
659 int intronLength = 0;
660 while (basesWritten + toOffset < mappedCodonEnd
661 && thisSeqPos < thisSeq.length)
663 final char c = thisSeq[thisSeqPos++];
667 int sourcePosition = basesWritten + toOffset;
668 if (sourcePosition < mappedCodonStart)
671 * Found an unmapped (intron) base. First add in any preceding gaps
674 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
676 thisAligned.append(trailingCopiedGap.toString());
677 intronLength += trailingCopiedGap.length();
678 trailingCopiedGap = new StringBuilder();
685 final boolean startOfCodon = sourcePosition == mappedCodonStart;
686 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
687 preserveUnmappedGaps, sourceGapMappedLength, inExon,
688 trailingCopiedGap.length(), intronLength, startOfCodon);
689 for (int i = 0; i < gapsToAdd; i++)
691 thisAligned.append(myGapChar);
693 sourceGapMappedLength = 0;
696 thisAligned.append(c);
697 trailingCopiedGap = new StringBuilder();
701 if (inExon && preserveMappedGaps)
703 trailingCopiedGap.append(myGapChar);
705 else if (!inExon && preserveUnmappedGaps)
707 trailingCopiedGap.append(myGapChar);
714 * At end of model aligned sequence. Copy any remaining target sequence, optionally
715 * including (intron) gaps.
717 while (thisSeqPos < thisSeq.length)
719 final char c = thisSeq[thisSeqPos++];
720 if (c != myGapChar || preserveUnmappedGaps)
722 thisAligned.append(c);
724 sourceGapMappedLength--;
728 * finally add gaps to pad for any trailing source gaps or
729 * unmapped characters
731 if (preserveUnmappedGaps)
733 while (sourceGapMappedLength > 0)
735 thisAligned.append(myGapChar);
736 sourceGapMappedLength--;
741 * All done aligning, set the aligned sequence.
743 alignTo.setSequence(new String(thisAligned));
747 * Helper method to work out how many gaps to insert when realigning.
749 * @param preserveMappedGaps
750 * @param preserveUnmappedGaps
751 * @param sourceGapMappedLength
753 * @param trailingCopiedGap
754 * @param intronLength
755 * @param startOfCodon
758 protected static int calculateGapsToInsert(boolean preserveMappedGaps,
759 boolean preserveUnmappedGaps, int sourceGapMappedLength,
760 boolean inExon, int trailingGapLength, int intronLength,
761 final boolean startOfCodon)
767 * Reached start of codon. Ignore trailing gaps in intron unless we are
768 * preserving gaps in both exon and intron. Ignore them anyway if the
769 * protein alignment introduces a gap at least as large as the intronic
772 if (inExon && !preserveMappedGaps)
774 trailingGapLength = 0;
776 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
778 trailingGapLength = 0;
782 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
786 if (intronLength + trailingGapLength <= sourceGapMappedLength)
788 gapsToAdd = sourceGapMappedLength - intronLength;
792 gapsToAdd = Math.min(intronLength + trailingGapLength
793 - sourceGapMappedLength, trailingGapLength);
800 * second or third base of codon; check for any gaps in dna
802 if (!preserveMappedGaps)
804 trailingGapLength = 0;
806 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
812 * Realigns the given protein to match the alignment of the dna, using codon
813 * mappings to translate aligned codon positions to protein residues.
816 * the alignment whose sequences are realigned by this method
818 * the dna alignment whose alignment we are 'copying'
819 * @return the number of sequences that were realigned
821 public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
823 List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
824 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
825 protein, dna, unmappedProtein);
826 return alignProteinAs(protein, alignedCodons, unmappedProtein);
830 * Builds a map whose key is an aligned codon position (3 alignment column
831 * numbers base 0), and whose value is a map from protein sequence to each
832 * protein's peptide residue for that codon. The map generates an ordering of
833 * the codons, and allows us to read off the peptides at each position in
834 * order to assemble 'aligned' protein sequences.
837 * the protein alignment
839 * the coding dna alignment
840 * @param unmappedProtein
841 * any unmapped proteins are added to this list
844 protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
845 AlignmentI protein, AlignmentI dna,
846 List<SequenceI> unmappedProtein)
849 * maintain a list of any proteins with no mappings - these will be
850 * rendered 'as is' in the protein alignment as we can't align them
852 unmappedProtein.addAll(protein.getSequences());
854 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
857 * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
858 * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
859 * comparator keeps the codon positions ordered.
861 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, AlignedCodon>>(
862 new CodonComparator());
864 for (SequenceI dnaSeq : dna.getSequences())
866 for (AlignedCodonFrame mapping : mappings)
868 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
871 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
872 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
873 seqMap, alignedCodons);
874 unmappedProtein.remove(prot);
880 * Finally add any unmapped peptide start residues (e.g. for incomplete
881 * codons) as if at the codon position before the second residue
883 // TODO resolve JAL-2022 so this fudge can be removed
884 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
885 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
887 return alignedCodons;
891 * Scans for any protein mapped from position 2 (meaning unmapped start
892 * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
893 * preceding position in the alignment
895 * @param alignedCodons
896 * the codon-to-peptide map
897 * @param mappedSequenceCount
898 * the number of distinct sequences in the map
900 protected static void addUnmappedPeptideStarts(
901 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
902 int mappedSequenceCount)
904 // TODO delete this ugly hack once JAL-2022 is resolved
905 // i.e. we can model startPhase > 0 (incomplete start codon)
907 List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
908 AlignedCodon lastCodon = null;
909 Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
911 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
914 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
917 SequenceI seq = sequenceCodon.getKey();
918 if (sequencesChecked.contains(seq))
922 sequencesChecked.add(seq);
923 AlignedCodon codon = sequenceCodon.getValue();
924 if (codon.peptideCol > 1)
927 .println("Problem mapping protein with >1 unmapped start positions: "
930 else if (codon.peptideCol == 1)
933 * first position (peptideCol == 0) was unmapped - add it
935 if (lastCodon != null)
937 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
938 lastCodon.pos2, lastCodon.pos3, String.valueOf(seq
940 toAdd.put(seq, firstPeptide);
945 * unmapped residue at start of alignment (no prior column) -
946 * 'insert' at nominal codon [0, 0, 0]
948 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
949 String.valueOf(seq.getCharAt(0)), 0);
950 toAdd.put(seq, firstPeptide);
953 if (sequencesChecked.size() == mappedSequenceCount)
955 // no need to check past first mapped position in all sequences
959 lastCodon = entry.getKey();
963 * add any new codons safely after iterating over the map
965 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
967 addCodonToMap(alignedCodons, startCodon.getValue(),
968 startCodon.getKey());
973 * Update the aligned protein sequences to match the codon alignments given in
977 * @param alignedCodons
978 * an ordered map of codon positions (columns), with sequence/peptide
979 * values present in each column
980 * @param unmappedProtein
983 protected static int alignProteinAs(AlignmentI protein,
984 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
985 List<SequenceI> unmappedProtein)
988 * Prefill aligned sequences with gaps before inserting aligned protein
991 int alignedWidth = alignedCodons.size();
992 char[] gaps = new char[alignedWidth];
993 Arrays.fill(gaps, protein.getGapCharacter());
994 String allGaps = String.valueOf(gaps);
995 for (SequenceI seq : protein.getSequences())
997 if (!unmappedProtein.contains(seq))
999 seq.setSequence(allGaps);
1004 for (AlignedCodon codon : alignedCodons.keySet())
1006 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1008 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1010 // place translated codon at its column position in sequence
1011 entry.getKey().getSequence()[column] = entry.getValue().product
1020 * Populate the map of aligned codons by traversing the given sequence
1021 * mapping, locating the aligned positions of mapped codons, and adding those
1022 * positions and their translation products to the map.
1025 * the aligned sequence we are mapping from
1027 * the sequence to be aligned to the codons
1029 * the gap character in the dna sequence
1031 * a mapping to a sequence translation
1032 * @param alignedCodons
1033 * the map we are building up
1035 static void addCodonPositions(SequenceI dna, SequenceI protein,
1036 char gapChar, Mapping seqMap,
1037 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1039 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1042 * add codon positions, and their peptide translations, to the alignment
1043 * map, while remembering the first codon mapped
1045 while (codons.hasNext())
1049 AlignedCodon codon = codons.next();
1050 addCodonToMap(alignedCodons, codon, protein);
1051 } catch (IncompleteCodonException e)
1053 // possible incomplete trailing codon - ignore
1054 } catch (NoSuchElementException e)
1056 // possibly peptide lacking STOP
1062 * Helper method to add a codon-to-peptide entry to the aligned codons map
1064 * @param alignedCodons
1068 protected static void addCodonToMap(
1069 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1070 AlignedCodon codon, SequenceI protein)
1072 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1073 if (seqProduct == null)
1075 seqProduct = new HashMap<SequenceI, AlignedCodon>();
1076 alignedCodons.put(codon, seqProduct);
1078 seqProduct.put(protein, codon);
1082 * Returns true if a cDNA/Protein mapping either exists, or could be made,
1083 * between at least one pair of sequences in the two alignments. Currently,
1086 * <li>One alignment must be nucleotide, and the other protein</li>
1087 * <li>At least one pair of sequences must be already mapped, or mappable</li>
1088 * <li>Mappable means the nucleotide translation matches the protein sequence</li>
1089 * <li>The translation may ignore start and stop codons if present in the
1097 public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1099 if (al1 == null || al2 == null)
1105 * Require one nucleotide and one protein
1107 if (al1.isNucleotide() == al2.isNucleotide())
1111 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1112 AlignmentI protein = dna == al1 ? al2 : al1;
1113 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1114 for (SequenceI dnaSeq : dna.getSequences())
1116 for (SequenceI proteinSeq : protein.getSequences())
1118 if (isMappable(dnaSeq, proteinSeq, mappings))
1128 * Returns true if the dna sequence is mapped, or could be mapped, to the
1136 protected static boolean isMappable(SequenceI dnaSeq,
1137 SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1139 if (dnaSeq == null || proteinSeq == null)
1144 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq
1145 .getDatasetSequence();
1146 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq
1147 : proteinSeq.getDatasetSequence();
1149 for (AlignedCodonFrame mapping : mappings)
1151 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1161 * Just try to make a mapping (it is not yet stored), test whether
1164 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1168 * Finds any reference annotations associated with the sequences in
1169 * sequenceScope, that are not already added to the alignment, and adds them
1170 * to the 'candidates' map. Also populates a lookup table of annotation
1171 * labels, keyed by calcId, for use in constructing tooltips or the like.
1173 * @param sequenceScope
1174 * the sequences to scan for reference annotations
1175 * @param labelForCalcId
1176 * (optional) map to populate with label for calcId
1178 * map to populate with annotations for sequence
1180 * the alignment to check for presence of annotations
1182 public static void findAddableReferenceAnnotations(
1183 List<SequenceI> sequenceScope,
1184 Map<String, String> labelForCalcId,
1185 final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1188 if (sequenceScope == null)
1194 * For each sequence in scope, make a list of any annotations on the
1195 * underlying dataset sequence which are not already on the alignment.
1197 * Add to a map of { alignmentSequence, <List of annotations to add> }
1199 for (SequenceI seq : sequenceScope)
1201 SequenceI dataset = seq.getDatasetSequence();
1202 if (dataset == null)
1206 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1207 if (datasetAnnotations == null)
1211 final List<AlignmentAnnotation> result = new ArrayList<AlignmentAnnotation>();
1212 for (AlignmentAnnotation dsann : datasetAnnotations)
1215 * Find matching annotations on the alignment. If none is found, then
1216 * add this annotation to the list of 'addable' annotations for this
1219 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1220 .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1221 if (!matchedAlignmentAnnotations.iterator().hasNext())
1224 if (labelForCalcId != null)
1226 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1231 * Save any addable annotations for this sequence
1233 if (!result.isEmpty())
1235 candidates.put(seq, result);
1241 * Adds annotations to the top of the alignment annotations, in the same order
1242 * as their related sequences.
1244 * @param annotations
1245 * the annotations to add
1247 * the alignment to add them to
1248 * @param selectionGroup
1249 * current selection group (or null if none)
1251 public static void addReferenceAnnotations(
1252 Map<SequenceI, List<AlignmentAnnotation>> annotations,
1253 final AlignmentI alignment, final SequenceGroup selectionGroup)
1255 for (SequenceI seq : annotations.keySet())
1257 for (AlignmentAnnotation ann : annotations.get(seq))
1259 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1261 int endRes = ann.annotations.length;
1262 if (selectionGroup != null)
1264 startRes = selectionGroup.getStartRes();
1265 endRes = selectionGroup.getEndRes();
1267 copyAnn.restrict(startRes, endRes);
1270 * Add to the sequence (sets copyAnn.datasetSequence), unless the
1271 * original annotation is already on the sequence.
1273 if (!seq.hasAnnotation(ann))
1275 seq.addAlignmentAnnotation(copyAnn);
1278 copyAnn.adjustForAlignment();
1279 // add to the alignment and set visible
1280 alignment.addAnnotation(copyAnn);
1281 copyAnn.visible = true;
1287 * Set visibility of alignment annotations of specified types (labels), for
1288 * specified sequences. This supports controls like
1289 * "Show all secondary structure", "Hide all Temp factor", etc.
1291 * @al the alignment to scan for annotations
1293 * the types (labels) of annotations to be updated
1294 * @param forSequences
1295 * if not null, only annotations linked to one of these sequences are
1296 * in scope for update; if null, acts on all sequence annotations
1298 * if this flag is true, 'types' is ignored (label not checked)
1300 * if true, set visibility on, else set off
1302 public static void showOrHideSequenceAnnotations(AlignmentI al,
1303 Collection<String> types, List<SequenceI> forSequences,
1304 boolean anyType, boolean doShow)
1306 for (AlignmentAnnotation aa : al.getAlignmentAnnotation())
1308 if (anyType || types.contains(aa.label))
1310 if ((aa.sequenceRef != null)
1311 && (forSequences == null || forSequences
1312 .contains(aa.sequenceRef)))
1314 aa.visible = doShow;
1321 * Returns true if either sequence has a cross-reference to the other
1327 public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1329 // Note: moved here from class CrossRef as the latter class has dependencies
1330 // not availability to the applet's classpath
1331 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1335 * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1336 * that sequence name is structured as Source|AccessionId.
1342 public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1344 if (seq1 == null || seq2 == null)
1348 String name = seq2.getName();
1349 final DBRefEntry[] xrefs = seq1.getDBRefs();
1352 for (DBRefEntry xref : xrefs)
1354 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1355 // case-insensitive test, consistent with DBRefEntry.equalRef()
1356 if (xrefName.equalsIgnoreCase(name))
1366 * Constructs an alignment consisting of the mapped (CDS) regions in the given
1367 * nucleotide sequences, and updates mappings to match. The CDS sequences are
1368 * added to the original alignment's dataset, which is shared by the new
1369 * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1370 * added to the alignment dataset.
1373 * aligned dna sequences
1375 * from dna to protein
1377 * @return an alignment whose sequences are the cds-only parts of the dna
1378 * sequences (or null if no mappings are found)
1380 public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1381 List<AlignedCodonFrame> mappings, AlignmentI al)
1383 List<SequenceI> cdsSeqs = new ArrayList<SequenceI>();
1385 for (SequenceI seq : dna)
1387 AlignedCodonFrame cdsMappings = new AlignedCodonFrame();
1388 List<AlignedCodonFrame> seqMappings = MappingUtils
1389 .findMappingsForSequence(seq, mappings);
1390 List<AlignedCodonFrame> alignmentMappings = al.getCodonFrames();
1391 for (AlignedCodonFrame mapping : seqMappings)
1393 for (Mapping aMapping : mapping.getMappingsFromSequence(seq))
1395 SequenceI cdsSeq = makeCdsSequence(seq.getDatasetSequence(),
1397 cdsSeqs.add(cdsSeq);
1400 * add a mapping from CDS to the (unchanged) mapped to range
1402 List<int[]> cdsRange = Collections.singletonList(new int[] { 1,
1403 cdsSeq.getLength() });
1404 MapList map = new MapList(cdsRange, aMapping.getMap()
1405 .getToRanges(), aMapping.getMap().getFromRatio(),
1406 aMapping.getMap().getToRatio());
1407 cdsMappings.addMap(cdsSeq, aMapping.getTo(), map);
1410 * add another mapping from original 'from' range to CDS
1412 map = new MapList(aMapping.getMap().getFromRanges(), cdsRange, 1,
1414 cdsMappings.addMap(seq.getDatasetSequence(), cdsSeq, map);
1416 alignmentMappings.add(cdsMappings);
1419 * transfer any features on dna that overlap the CDS
1421 transferFeatures(seq, cdsSeq, map, null, SequenceOntologyI.CDS);
1427 * add CDS seqs to shared dataset
1429 Alignment dataset = al.getDataset();
1430 for (SequenceI seq : cdsSeqs)
1432 if (!dataset.getSequences().contains(seq.getDatasetSequence()))
1434 dataset.addSequence(seq.getDatasetSequence());
1437 AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
1439 cds.setDataset(dataset);
1445 * Helper method that makes a CDS sequence as defined by the mappings from the
1446 * given sequence i.e. extracts the 'mapped from' ranges (which may be on
1447 * forward or reverse strand).
1453 static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping)
1455 char[] seqChars = seq.getSequence();
1456 List<int[]> fromRanges = mapping.getMap().getFromRanges();
1457 int cdsWidth = MappingUtils.getLength(fromRanges);
1458 char[] newSeqChars = new char[cdsWidth];
1461 for (int[] range : fromRanges)
1463 if (range[0] <= range[1])
1465 // forward strand mapping - just copy the range
1466 int length = range[1] - range[0] + 1;
1467 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
1473 // reverse strand mapping - copy and complement one by one
1474 for (int i = range[0]; i >= range[1]; i--)
1476 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
1481 SequenceI newSeq = new Sequence(seq.getName() + "|"
1482 + mapping.getTo().getName(), newSeqChars, 1, newPos);
1483 newSeq.createDatasetSequence();
1488 * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
1489 * feature start/end ranges, optionally omitting specified feature types.
1490 * Returns the number of features copied.
1495 * if not null, only features of this type are copied (including
1496 * subtypes in the Sequence Ontology)
1498 * the mapping from 'fromSeq' to 'toSeq'
1501 public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
1502 MapList mapping, String select, String... omitting)
1504 SequenceI copyTo = toSeq;
1505 while (copyTo.getDatasetSequence() != null)
1507 copyTo = copyTo.getDatasetSequence();
1510 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
1512 SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
1515 for (SequenceFeature sf : sfs)
1517 String type = sf.getType();
1518 if (select != null && !so.isA(type, select))
1522 boolean omit = false;
1523 for (String toOmit : omitting)
1525 if (type.equals(toOmit))
1536 * locate the mapped range - null if either start or end is
1537 * not mapped (no partial overlaps are calculated)
1539 int start = sf.getBegin();
1540 int end = sf.getEnd();
1541 int[] mappedTo = mapping.locateInTo(start, end);
1543 * if whole exon range doesn't map, try interpreting it
1544 * as 5' or 3' exon overlapping the CDS range
1546 if (mappedTo == null)
1548 mappedTo = mapping.locateInTo(end, end);
1549 if (mappedTo != null)
1552 * end of exon is in CDS range - 5' overlap
1553 * to a range from the start of the peptide
1558 if (mappedTo == null)
1560 mappedTo = mapping.locateInTo(start, start);
1561 if (mappedTo != null)
1564 * start of exon is in CDS range - 3' overlap
1565 * to a range up to the end of the peptide
1567 mappedTo[1] = toSeq.getLength();
1570 if (mappedTo != null)
1572 SequenceFeature copy = new SequenceFeature(sf);
1573 copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
1574 copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
1575 copyTo.addSequenceFeature(copy);
1584 * Returns a mapping from dna to protein by inspecting sequence features of
1585 * type "CDS" on the dna.
1591 public static MapList mapCdsToProtein(SequenceI dnaSeq,
1592 SequenceI proteinSeq)
1594 List<int[]> ranges = findCdsPositions(dnaSeq);
1595 int mappedDnaLength = MappingUtils.getLength(ranges);
1597 int proteinLength = proteinSeq.getLength();
1598 int proteinStart = proteinSeq.getStart();
1599 int proteinEnd = proteinSeq.getEnd();
1602 * incomplete start codon may mean X at start of peptide
1603 * we ignore both for mapping purposes
1605 if (proteinSeq.getCharAt(0) == 'X')
1607 // todo JAL-2022 support startPhase > 0
1611 List<int[]> proteinRange = new ArrayList<int[]>();
1614 * dna length should map to protein (or protein plus stop codon)
1616 int codesForResidues = mappedDnaLength / 3;
1617 if (codesForResidues == (proteinLength + 1))
1619 // assuming extra codon is for STOP and not in peptide
1622 if (codesForResidues == proteinLength)
1624 proteinRange.add(new int[] { proteinStart, proteinEnd });
1625 return new MapList(ranges, proteinRange, 3, 1);
1631 * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
1632 * start/end positions of sequence features of type "CDS" (or a sub-type of
1633 * CDS in the Sequence Ontology). The ranges are sorted into ascending start
1634 * position order, so this method is only valid for linear CDS in the same
1635 * sense as the protein product.
1640 public static List<int[]> findCdsPositions(SequenceI dnaSeq)
1642 List<int[]> result = new ArrayList<int[]>();
1643 SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
1649 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
1652 for (SequenceFeature sf : sfs)
1655 * process a CDS feature (or a sub-type of CDS)
1657 if (so.isA(sf.getType(), SequenceOntologyI.CDS))
1662 phase = Integer.parseInt(sf.getPhase());
1663 } catch (NumberFormatException e)
1668 * phase > 0 on first codon means 5' incomplete - skip to the start
1669 * of the next codon; example ENST00000496384
1671 int begin = sf.getBegin();
1672 int end = sf.getEnd();
1673 if (result.isEmpty())
1678 // shouldn't happen!
1680 .println("Error: start phase extends beyond start CDS in "
1681 + dnaSeq.getName());
1684 result.add(new int[] { begin, end });
1689 * remove 'startPhase' positions (usually 0) from the first range
1690 * so we begin at the start of a complete codon
1692 if (!result.isEmpty())
1694 // TODO JAL-2022 correctly model start phase > 0
1695 result.get(0)[0] += startPhase;
1699 * Finally sort ranges by start position. This avoids a dependency on
1700 * keeping features in order on the sequence (if they are in order anyway,
1701 * the sort will have almost no work to do). The implicit assumption is CDS
1702 * ranges are assembled in order. Other cases should not use this method,
1703 * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
1705 Collections.sort(result, new Comparator<int[]>()
1708 public int compare(int[] o1, int[] o2)
1710 return Integer.compare(o1[0], o2[0]);
1717 * Maps exon features from dna to protein, and computes variants in peptide
1718 * product generated by variants in dna, and adds them as sequence_variant
1719 * features on the protein sequence. Returns the number of variant features
1724 * @param dnaToProtein
1726 public static int computeProteinFeatures(SequenceI dnaSeq,
1727 SequenceI peptide, MapList dnaToProtein)
1729 while (dnaSeq.getDatasetSequence() != null)
1731 dnaSeq = dnaSeq.getDatasetSequence();
1733 while (peptide.getDatasetSequence() != null)
1735 peptide = peptide.getDatasetSequence();
1738 transferFeatures(dnaSeq, peptide, dnaToProtein, SequenceOntologyI.EXON);
1741 * compute protein variants from dna variants and codon mappings;
1742 * NB - alternatively we could retrieve this using the REST service e.g.
1743 * http://rest.ensembl.org/overlap/translation
1744 * /ENSP00000288602?feature=transcript_variation;content-type=text/xml
1745 * which would be a bit slower but possibly more reliable
1747 LinkedHashMap<Integer, List<String[][]>> variants = buildDnaVariantsMap(
1748 dnaSeq, dnaToProtein);
1751 * scan codon variations, compute peptide variants and add to peptide sequence
1754 for (Entry<Integer, List<String[][]>> variant : variants.entrySet())
1756 int peptidePos = variant.getKey();
1757 List<String[][]> codonVariants = variant.getValue();
1758 String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
1759 for (String[][] codonVariant : codonVariants)
1761 List<String> peptideVariants = computePeptideVariants(codonVariant,
1763 if (!peptideVariants.isEmpty())
1765 String desc = residue
1766 + "->" // include canonical residue in description
1768 .listToDelimitedString(peptideVariants, ", ");
1769 SequenceFeature sf = new SequenceFeature(
1770 SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
1771 peptidePos, 0f, null);
1772 peptide.addSequenceFeature(sf);
1779 * ugly sort to get sequence features in start position order
1780 * - would be better to store in Sequence as a TreeSet or NCList?
1782 Arrays.sort(peptide.getSequenceFeatures(),
1783 new Comparator<SequenceFeature>()
1786 public int compare(SequenceFeature o1, SequenceFeature o2)
1788 int c = Integer.compare(o1.getBegin(), o2.getBegin());
1789 return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
1797 * Builds a map whose key is position in the protein sequence, and value is an
1798 * array of all variants for the coding codon positions
1801 * @param dnaToProtein
1804 static LinkedHashMap<Integer, List<String[][]>> buildDnaVariantsMap(
1805 SequenceI dnaSeq, MapList dnaToProtein)
1808 * map from peptide position to all variant features of the codon for it
1809 * LinkedHashMap ensures we add the peptide features in sequence order
1811 LinkedHashMap<Integer, List<String[][]>> variants = new LinkedHashMap<Integer, List<String[][]>>();
1812 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
1814 SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
1815 if (dnaFeatures == null)
1820 int dnaStart = dnaSeq.getStart();
1821 int[] lastCodon = null;
1822 int lastPeptidePostion = 0;
1825 * build a map of codon variations for peptides
1827 for (SequenceFeature sf : dnaFeatures)
1829 int dnaCol = sf.getBegin();
1830 if (dnaCol != sf.getEnd())
1832 // not handling multi-locus variant features
1835 if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
1837 int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
1840 // feature doesn't lie within coding region
1843 int peptidePosition = mapsTo[0];
1844 List<String[][]> codonVariants = variants.get(peptidePosition);
1845 if (codonVariants == null)
1847 codonVariants = new ArrayList<String[][]>();
1848 variants.put(peptidePosition, codonVariants);
1852 * extract dna variants to a string array
1854 String alls = (String) sf.getValue("alleles");
1859 String[] alleles = alls.toUpperCase().split(",");
1861 for (String allele : alleles)
1863 alleles[i++] = allele.trim(); // lose any space characters "A, G"
1867 * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
1869 int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
1870 : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
1871 peptidePosition, peptidePosition));
1872 lastPeptidePostion = peptidePosition;
1876 * save nucleotide (and this variant) for each codon position
1878 String[][] codonVariant = new String[3][];
1879 for (int codonPos = 0; codonPos < 3; codonPos++)
1881 String nucleotide = String.valueOf(
1882 dnaSeq.getCharAt(codon[codonPos] - dnaStart))
1884 if (codonVariant[codonPos] == null)
1887 * record current dna base
1889 codonVariant[codonPos] = new String[] { nucleotide };
1891 if (codon[codonPos] == dnaCol)
1894 * add alleles to dna base (and any previously found alleles)
1896 String[] known = codonVariant[codonPos];
1897 String[] dnaVariants = new String[alleles.length + known.length];
1898 System.arraycopy(known, 0, dnaVariants, 0, known.length);
1899 System.arraycopy(alleles, 0, dnaVariants, known.length,
1901 codonVariant[codonPos] = dnaVariants;
1904 codonVariants.add(codonVariant);
1911 * Returns a sorted, non-redundant list of all peptide translations generated
1912 * by the given dna variants, excluding the current residue value
1914 * @param codonVariants
1915 * an array of base values (acgtACGT) for codon positions 1, 2, 3
1917 * the current residue translation
1920 static List<String> computePeptideVariants(String[][] codonVariants,
1923 List<String> result = new ArrayList<String>();
1924 for (String base1 : codonVariants[0])
1926 for (String base2 : codonVariants[1])
1928 for (String base3 : codonVariants[2])
1930 String codon = base1 + base2 + base3;
1932 * get peptide translation of codon e.g. GAT -> D
1933 * note that variants which are not single alleles,
1934 * e.g. multibase variants or HGMD_MUTATION etc
1937 String peptide = codon.contains("-") ? "-"
1938 : (codon.length() > 3 ? null : ResidueProperties
1939 .codonTranslate(codon));
1940 if (peptide != null && !result.contains(peptide)
1941 && !peptide.equalsIgnoreCase(residue))
1943 result.add(peptide);
1950 * sort alphabetically with STOP at the end
1952 Collections.sort(result, new Comparator<String>()
1956 public int compare(String o1, String o2)
1958 if ("STOP".equals(o1))
1962 else if ("STOP".equals(o2))
1968 return o1.compareTo(o2);
1976 * Makes an alignment with a copy of the given sequences, adding in any
1977 * non-redundant sequences which are mapped to by the cross-referenced
1984 public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
1987 AlignmentI copy = new Alignment(new Alignment(seqs));
1989 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
1992 for (SequenceI xref : xrefs)
1994 DBRefEntry[] dbrefs = xref.getDBRefs();
1997 for (DBRefEntry dbref : dbrefs)
1999 if (dbref.getMap() == null || dbref.getMap().getTo() == null)
2003 SequenceI mappedTo = dbref.getMap().getTo();
2004 SequenceI match = matcher.findIdMatch(mappedTo);
2007 matcher.add(mappedTo);
2008 copy.addSequence(mappedTo);
2018 * Try to align sequences in 'unaligned' to match the alignment of their
2019 * mapped regions in 'aligned'. For example, could use this to align CDS
2020 * sequences which are mapped to their parent cDNA sequences.
2022 * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2023 * dna-to-protein or protein-to-dna use alternative methods.
2026 * sequences to be aligned
2028 * holds aligned sequences and their mappings
2031 public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2033 List<SequenceI> unmapped = new ArrayList<SequenceI>();
2034 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2035 unaligned, aligned, unmapped);
2036 int width = columnMap.size();
2037 char gap = unaligned.getGapCharacter();
2038 int realignedCount = 0;
2040 for (SequenceI seq : unaligned.getSequences())
2042 if (!unmapped.contains(seq))
2044 char[] newSeq = new char[width];
2045 Arrays.fill(newSeq, gap);
2050 * traverse the map to find columns populated
2053 for (Integer column : columnMap.keySet())
2055 Character c = columnMap.get(column).get(seq);
2059 * sequence has a character at this position
2069 * trim trailing gaps
2071 if (lastCol < width)
2073 char[] tmp = new char[lastCol + 1];
2074 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2077 seq.setSequence(String.valueOf(newSeq));
2081 return realignedCount;
2085 * Returns a map whose key is alignment column number (base 1), and whose
2086 * values are a map of sequence characters in that column.
2093 static Map<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2094 AlignmentI unaligned, AlignmentI aligned, List<SequenceI> unmapped)
2097 * Map will hold, for each aligned column position, a map of
2098 * {unalignedSequence, sequenceCharacter} at that position.
2099 * TreeMap keeps the entries in ascending column order.
2101 Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
2104 * r any sequences that have no mapping so can't be realigned
2106 unmapped.addAll(unaligned.getSequences());
2108 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2110 for (SequenceI seq : unaligned.getSequences())
2112 for (AlignedCodonFrame mapping : mappings)
2114 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2115 if (fromSeq != null)
2117 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2118 if (addMappedPositions(seq, fromSeq, seqMap, map))
2120 unmapped.remove(seq);
2129 * Helper method that adds to a map the mapped column positions of a sequence. <br>
2130 * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2131 * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2135 * the sequence whose column positions we are recording
2137 * a sequence that is mapped to the first sequence
2139 * the mapping from 'fromSeq' to 'seq'
2141 * a map to add the column positions (in fromSeq) of the mapped
2145 static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2146 Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2153 char[] fromChars = fromSeq.getSequence();
2154 int toStart = seq.getStart();
2155 char[] toChars = seq.getSequence();
2158 * traverse [start, end, start, end...] ranges in fromSeq
2160 for (int[] fromRange : seqMap.getMap().getFromRanges())
2162 for (int i = 0; i < fromRange.length - 1; i += 2)
2164 boolean forward = fromRange[i + 1] >= fromRange[i];
2167 * find the range mapped to (sequence positions base 1)
2169 int[] range = seqMap.locateMappedRange(fromRange[i],
2173 System.err.println("Error in mapping " + seqMap + " from "
2174 + fromSeq.getName());
2177 int fromCol = fromSeq.findIndex(fromRange[i]);
2178 int mappedCharPos = range[0];
2181 * walk over the 'from' aligned sequence in forward or reverse
2182 * direction; when a non-gap is found, record the column position
2183 * of the next character of the mapped-to sequence; stop when all
2184 * the characters of the range have been counted
2186 while (mappedCharPos <= range[1])
2188 if (!Comparison.isGap(fromChars[fromCol - 1]))
2191 * mapped from sequence has a character in this column
2192 * record the column position for the mapped to character
2194 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2195 if (seqsMap == null)
2197 seqsMap = new HashMap<SequenceI, Character>();
2198 map.put(fromCol, seqsMap);
2200 seqsMap.put(seq, toChars[mappedCharPos - toStart]);
2203 fromCol += (forward ? 1 : -1);
2210 // strictly temporary hack until proper criteria for aligning protein to cds
2211 // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
2212 public static boolean looksLikeEnsembl(AlignmentI alignment)
2214 for (SequenceI seq : alignment.getSequences())
2216 String name = seq.getName();
2217 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))