/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.datamodel; import java.util.AbstractList; import java.util.ArrayList; import java.util.List; import jalview.util.MapList; import jalview.util.MappingUtils; /** * Stores mapping between the columns of a protein alignment and a DNA alignment * and a list of individual codon to amino acid mappings between sequences. */ public class AlignedCodonFrame { /* * Data bean to hold mappings from one sequence to another */ public class SequenceToSequenceMapping { private SequenceI fromSeq; private Mapping mapping; SequenceToSequenceMapping(SequenceI from, Mapping map) { this.fromSeq = from; this.mapping = map; } /** * Readable representation for debugging only, not guaranteed not to change */ @Override public String toString() { return String.format("From %s %s", fromSeq.getName(), mapping.toString()); } /** * Returns a hashCode derived from the hashcodes of the mappings and fromSeq * * @see SequenceToSequenceMapping#hashCode() */ @Override public int hashCode() { return (fromSeq == null ? 0 : fromSeq.hashCode() * 31) + mapping.hashCode(); } /** * Answers true if the objects hold the same mapping between the same two * sequences * * @see Mapping#equals */ @Override public boolean equals(Object obj) { if (!(obj instanceof SequenceToSequenceMapping)) { return false; } SequenceToSequenceMapping that = (SequenceToSequenceMapping) obj; if (this.mapping == null) { return that.mapping == null; } // TODO: can simplify by asserting fromSeq is a dataset sequence return (this.fromSeq == that.fromSeq || (this.fromSeq != null && that.fromSeq != null && this.fromSeq.getDatasetSequence() != null && this.fromSeq.getDatasetSequence() == that.fromSeq .getDatasetSequence())) && this.mapping.equals(that.mapping); } public SequenceI getFromSeq() { return fromSeq; } public Mapping getMapping() { return mapping; } /** * Returns true if the mapping covers the full length of the given sequence. * This allows us to distinguish the CDS that codes for a protein from * another overlapping CDS in the parent dna sequence. * * @param seq * @return */ public boolean covers(SequenceI seq) { return covers(seq,false,false); } /** * * @param seq * @param localCover - when true - compare extent of seq's dataset sequence rather than the local extent * @param either - when true coverage is required for either seq or the mapped sequence * @return true if mapping covers full length of given sequence (or the other if either==true) */ public boolean covers(SequenceI seq, boolean localCover,boolean either) { List mappedRanges = null,otherRanges=null; MapList mapList = mapping.getMap(); int mstart=seq.getStart(),mend=seq.getEnd(),ostart,oend; ; if (fromSeq == seq || fromSeq == seq.getDatasetSequence()) { if (localCover && fromSeq !=seq) { mstart=fromSeq.getStart(); mend=fromSeq.getEnd(); } mappedRanges = mapList.getFromRanges(); otherRanges=mapList.getToRanges(); ostart=mapping.to.getStart(); oend=mapping.to.getEnd(); } else if (mapping.to == seq || mapping.to == seq.getDatasetSequence()) { if (localCover && mapping.to !=seq) { mstart=mapping.to.getStart(); mend=mapping.to.getEnd(); } mappedRanges = mapList.getToRanges(); otherRanges=mapList.getFromRanges(); ostart=fromSeq.getStart(); oend=fromSeq.getEnd(); } else { return false; } /* * check that each mapped range lies within the sequence range * (necessary for circular CDS - example EMBL:J03321:AAA91567) * and mapped length covers (at least) sequence length */ int length = countRange(mappedRanges,mstart,mend); if (length != -1) { // add 1 to mapped length to allow for a mapped stop codon if (length + 1 >= (mend - mstart + 1)) { return true; } } if (either) { // also check coverage of the other range length = countRange(otherRanges, ostart, oend); if (length != -1) { if (length + 1 >= (oend - ostart + 1)) { return true; } } } return false; } private int countRange(List mappedRanges,int mstart,int mend) { int length=0; for (int[] range : mappedRanges) { int from = Math.min(range[0], range[1]); int to = Math.max(range[0], range[1]); if (from < mstart || to > mend) { return -1; } length += (to - from + 1); } return length; } /** * Adds any regions mapped to or from position {@code pos} in sequence * {@code seq} to the given search results * * @param seq * @param pos * @param sr */ public void markMappedRegion(SequenceI seq, int pos, SearchResultsI sr) { int[] codon = null; SequenceI mappedSeq = null; SequenceI ds = seq.getDatasetSequence(); if (ds == null) { ds = seq; } if (this.fromSeq == seq || this.fromSeq == ds) { codon = this.mapping.map.locateInTo(pos, pos); mappedSeq = this.mapping.to; } else if (this.mapping.to == seq || this.mapping.to == ds) { codon = this.mapping.map.locateInFrom(pos, pos); mappedSeq = this.fromSeq; } if (codon != null) { for (int i = 0; i < codon.length; i += 2) { sr.addResult(mappedSeq, codon[i], codon[i + 1]); } } } } private List mappings; /** * Constructor */ public AlignedCodonFrame() { mappings = new ArrayList<>(); } /** * Adds a mapping between the dataset sequences for the associated dna and * protein sequence objects * * @param dnaseq * @param aaseq * @param map */ public void addMap(SequenceI dnaseq, SequenceI aaseq, MapList map) { addMap(dnaseq, aaseq, map, null); } /** * Adds a mapping between the dataset sequences for the associated dna and * protein sequence objects * * @param dnaseq * @param aaseq * @param map * @param mapFromId */ public void addMap(SequenceI dnaseq, SequenceI aaseq, MapList map, String mapFromId) { // JBPNote DEBUG! THIS ! // dnaseq.transferAnnotation(aaseq, mp); // aaseq.transferAnnotation(dnaseq, new Mapping(map.getInverse())); SequenceI fromSeq = (dnaseq.getDatasetSequence() == null) ? dnaseq : dnaseq.getDatasetSequence(); SequenceI toSeq = (aaseq.getDatasetSequence() == null) ? aaseq : aaseq.getDatasetSequence(); /* * if we already hold a mapping between these sequences, just add to it * note that 'adding' a duplicate map does nothing; this protects against * creating duplicate mappings in AlignedCodonFrame */ for (SequenceToSequenceMapping ssm : mappings) { if (ssm.fromSeq == fromSeq && ssm.mapping.to == toSeq) { ssm.mapping.map.addMapList(map); return; } } /* * otherwise, add a new sequence mapping */ Mapping mp = new Mapping(toSeq, map); mp.setMappedFromId(mapFromId); mappings.add(new SequenceToSequenceMapping(fromSeq, mp)); } public SequenceI[] getdnaSeqs() { // TODO return a list instead? // return dnaSeqs; List seqs = new ArrayList<>(); for (SequenceToSequenceMapping ssm : mappings) { seqs.add(ssm.fromSeq); } return seqs.toArray(new SequenceI[seqs.size()]); } public SequenceI[] getAaSeqs() { // TODO not used - remove? List seqs = new ArrayList<>(); for (SequenceToSequenceMapping ssm : mappings) { seqs.add(ssm.mapping.to); } return seqs.toArray(new SequenceI[seqs.size()]); } public MapList[] getdnaToProt() { List maps = new ArrayList<>(); for (SequenceToSequenceMapping ssm : mappings) { maps.add(ssm.mapping.map); } return maps.toArray(new MapList[maps.size()]); } public Mapping[] getProtMappings() { List maps = new ArrayList<>(); for (SequenceToSequenceMapping ssm : mappings) { maps.add(ssm.mapping); } return maps.toArray(new Mapping[maps.size()]); } /** * Returns the first mapping found which is to or from the given sequence, or * null if none is found * * @param seq * @return */ public Mapping getMappingForSequence(SequenceI seq) { SequenceI seqDs = seq.getDatasetSequence(); seqDs = seqDs != null ? seqDs : seq; for (SequenceToSequenceMapping ssm : mappings) { if (ssm.fromSeq == seqDs || ssm.mapping.to == seqDs) { return ssm.mapping; } } return null; } /** * Return the corresponding aligned or dataset aa sequence for given dna * sequence, null if not found. * * @param sequenceRef * @return */ public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef) { SequenceI dnads = dnaSeqRef.getDatasetSequence(); for (SequenceToSequenceMapping ssm : mappings) { if (ssm.fromSeq == dnaSeqRef || ssm.fromSeq == dnads) { return ssm.mapping.to; } } return null; } /** * Return the corresponding aligned or dataset dna sequence for given amino * acid sequence, or null if not found. returns the sequence from the first * mapping found that involves the protein sequence. * * @param aaSeqRef * @return */ public SequenceI getDnaForAaSeq(SequenceI aaSeqRef) { SequenceI aads = aaSeqRef.getDatasetSequence(); for (SequenceToSequenceMapping ssm : mappings) { if (ssm.mapping.to == aaSeqRef || ssm.mapping.to == aads) { return ssm.fromSeq; } } return null; } /** * test to see if codon frame involves seq in any way * * @param seq * a nucleotide or protein sequence * @return true if a mapping exists to or from this sequence to any translated * sequence */ public boolean involvesSequence(SequenceI seq) { return getAaForDnaSeq(seq) != null || getDnaForAaSeq(seq) != null; } /** * Add search results for regions in other sequences that translate or are * translated from a particular position in seq (which may be an aligned or * dataset sequence) * * @param seq * @param index * position in seq * @param results * where highlighted regions go */ public void markMappedRegion(SequenceI seq, int index, SearchResultsI results) { SequenceI ds = seq.getDatasetSequence(); if (ds == null) { ds = seq; } for (SequenceToSequenceMapping ssm : mappings) { ssm.markMappedRegion(ds, index, results); } } /** * Returns the DNA codon positions (base 1) for the given position (base 1) in * a mapped protein sequence, or null if no mapping is found. * * Intended for use in aligning cDNA to match aligned protein. Only the first * mapping found is returned, so not suitable for use if multiple protein * sequences are mapped to the same cDNA (but aligning cDNA as protein is * ill-defined for this case anyway). * * @param seq * the DNA dataset sequence * @param aaPos * residue position (base 1) in a protein sequence * @return */ public int[] getDnaPosition(SequenceI seq, int aaPos) { /* * Adapted from markMappedRegion(). */ MapList ml = null; int i = 0; for (SequenceToSequenceMapping ssm : mappings) { if (ssm.fromSeq == seq) { ml = getdnaToProt()[i]; break; } i++; } return ml == null ? null : ml.locateInFrom(aaPos, aaPos); } /** * Convenience method to return the first aligned sequence in the given * alignment whose dataset has a mapping with the given (aligned or dataset) * sequence. * * @param seq * * @param al * @return */ public SequenceI findAlignedSequence(SequenceI seq, AlignmentI al) { return findAlignedSequence(seq, al, null); } /** * Convenience method to return the first aligned sequence in the given * alignment whose dataset has a mapping with the given (aligned or dataset) * sequence, and optionally the mapping that relates them * * @param seq * @param al * @param map - list to add the mapping to * @return sequence from al that maps to seq */ public SequenceI findAlignedSequence(SequenceI seq, AlignmentI al,List map) { /* * Search mapped protein ('to') sequences first. */ for (SequenceToSequenceMapping ssm : mappings) { int mStart=ssm.getMapping().getMap().getFromLowest(),mEnd=ssm.getMapping().map.getFromHighest(); if ((ssm.fromSeq == seq || ssm.fromSeq == seq.getDatasetSequence()) // here AlignmentUtilsTest. testAlignProteinAsDna_incompleteStartCodon fails because mStart/mEnd is contained by seq // without this filter, we don't get the correct mapping, however )// && seq.getStart()>=mStart && seq.getEnd()<=mEnd) { for (SequenceI sourceAligned : al.getSequences()) { if (ssm.covers(sourceAligned,true,false)) { if (map != null) { map.add(ssm); } return sourceAligned; } } } } /* * Then try mapped dna sequences. */ for (SequenceToSequenceMapping ssm : mappings) { int mStart=ssm.getMapping().getMap().getToLowest(),mEnd=ssm.getMapping().map.getToHighest(); if ((ssm.mapping.to == seq || ssm.mapping.to == seq.getDatasetSequence()) && seq.getStart()>=mStart && seq.getEnd()<=mEnd) { for (SequenceI sourceAligned : al.getSequences()) { if (ssm.covers(sourceAligned,true,true)) { if (map != null) { map.add(ssm); } return sourceAligned; } } } } return null; } /** * Returns the region in the target sequence's dataset that is mapped to the * given position (base 1) in the query sequence's dataset. The region is a * set of start/end position pairs. * * @param target * @param query * @param queryPos * @return */ public int[] getMappedRegion(SequenceI target, SequenceI query, int queryPos) { SequenceI targetDs = target.getDatasetSequence() == null ? target : target.getDatasetSequence(); SequenceI queryDs = query.getDatasetSequence() == null ? query : query.getDatasetSequence(); if (targetDs == null || queryDs == null /*|| dnaToProt == null*/) { return null; } for (SequenceToSequenceMapping ssm : mappings) { /* * try mapping from target to query */ if (ssm.fromSeq == targetDs && ssm.mapping.to == queryDs) { int[] codon = ssm.mapping.map.locateInFrom(queryPos, queryPos); if (codon != null) { return codon; } } /* * else try mapping from query to target */ else if (ssm.fromSeq == queryDs && ssm.mapping.to == targetDs) { int[] codon = ssm.mapping.map.locateInTo(queryPos, queryPos); if (codon != null) { return codon; } } } return null; } /** * Returns the mapped DNA codons for the given position in a protein sequence, * or null if no mapping is found. Returns a list of (e.g.) ['g', 'c', 't'] * codons. There may be more than one codon mapped to the protein if (for * example), there are mappings to cDNA variants. * * @param protein * the peptide dataset sequence * @param aaPos * residue position (base 1) in the peptide sequence * @return */ public List getMappedCodons(SequenceI protein, int aaPos) { MapList ml = null; SequenceI dnaSeq = null; List result = new ArrayList<>(); for (SequenceToSequenceMapping ssm : mappings) { if (ssm.mapping.to == protein && ssm.mapping.getMap().getFromRatio() == 3) { ml = ssm.mapping.map; dnaSeq = ssm.fromSeq; int[] codonPos = ml.locateInFrom(aaPos, aaPos); if (codonPos == null) { return null; } /* * Read off the mapped nucleotides (converting to position base 0) */ codonPos = MappingUtils.flattenRanges(codonPos); int start = dnaSeq.getStart(); char c1 = dnaSeq.getCharAt(codonPos[0] - start); char c2 = dnaSeq.getCharAt(codonPos[1] - start); char c3 = dnaSeq.getCharAt(codonPos[2] - start); result.add(new char[] { c1, c2, c3 }); } } return result.isEmpty() ? null : result; } /** * Returns any mappings found which are from the given sequence, and to * distinct sequences. * * @param seq * @return */ public List getMappingsFromSequence(SequenceI seq) { List result = new ArrayList<>(); List related = new ArrayList<>(); SequenceI seqDs = seq.getDatasetSequence(); seqDs = seqDs != null ? seqDs : seq; for (SequenceToSequenceMapping ssm : mappings) { final Mapping mapping = ssm.mapping; if (ssm.fromSeq == seqDs) { if (!related.contains(mapping.to)) { result.add(mapping); related.add(mapping.to); } } } return result; } /** * Test whether the given sequence is substitutable for one or more dummy * sequences in this mapping * * @param map * @param seq * @return */ public boolean isRealisableWith(SequenceI seq) { return realiseWith(seq, false) > 0; } /** * Replace any matchable mapped dummy sequences with the given real one. * Returns the count of sequence mappings instantiated. * * @param seq * @return */ public int realiseWith(SequenceI seq) { return realiseWith(seq, true); } /** * Returns the number of mapped dummy sequences that could be replaced with * the given real sequence. * * @param seq * a dataset sequence * @param doUpdate * if true, performs replacements, else only counts * @return */ protected int realiseWith(SequenceI seq, boolean doUpdate) { SequenceI ds = seq.getDatasetSequence() != null ? seq.getDatasetSequence() : seq; int count = 0; /* * check for replaceable DNA ('map from') sequences */ for (SequenceToSequenceMapping ssm : mappings) { SequenceI dna = ssm.fromSeq; if (dna instanceof SequenceDummy && dna.getName().equals(ds.getName())) { Mapping mapping = ssm.mapping; int mapStart = mapping.getMap().getFromLowest(); int mapEnd = mapping.getMap().getFromHighest(); boolean mappable = couldRealiseSequence(dna, ds, mapStart, mapEnd); if (mappable) { count++; if (doUpdate) { // TODO: new method ? ds.realise(dna); // might want to copy database refs as well ds.setSequenceFeatures(dna.getSequenceFeatures()); // dnaSeqs[i] = ds; ssm.fromSeq = ds; System.out.println("Realised mapped sequence " + ds.getName()); } } } /* * check for replaceable protein ('map to') sequences */ Mapping mapping = ssm.mapping; SequenceI prot = mapping.getTo(); int mapStart = mapping.getMap().getToLowest(); int mapEnd = mapping.getMap().getToHighest(); boolean mappable = couldRealiseSequence(prot, ds, mapStart, mapEnd); if (mappable) { count++; if (doUpdate) { // TODO: new method ? ds.realise(dna); // might want to copy database refs as well ds.setSequenceFeatures(dna.getSequenceFeatures()); ssm.mapping.setTo(ds); } } } return count; } /** * Helper method to test whether a 'real' sequence could replace a 'dummy' * sequence in the map. The criteria are that they have the same name, and * that the mapped region overlaps the candidate sequence. * * @param existing * @param replacement * @param mapStart * @param mapEnd * @return */ protected static boolean couldRealiseSequence(SequenceI existing, SequenceI replacement, int mapStart, int mapEnd) { if (existing instanceof SequenceDummy && !(replacement instanceof SequenceDummy) && existing.getName().equals(replacement.getName())) { int start = replacement.getStart(); int end = replacement.getEnd(); boolean mappingOverlapsSequence = (mapStart >= start && mapStart <= end) || (mapEnd >= start && mapEnd <= end); if (mappingOverlapsSequence) { return true; } } return false; } /** * Change any mapping to the given sequence to be to its dataset sequence * instead. For use when mappings are created before their referenced * sequences are instantiated, for example when parsing GFF data. * * @param seq */ public void updateToDataset(SequenceI seq) { if (seq == null || seq.getDatasetSequence() == null) { return; } SequenceI ds = seq.getDatasetSequence(); for (SequenceToSequenceMapping ssm : mappings) /* * 'from' sequences */ { if (ssm.fromSeq == seq) { ssm.fromSeq = ds; } /* * 'to' sequences */ if (ssm.mapping.to == seq) { ssm.mapping.to = ds; } } } /** * Answers true if this object contains no mappings * * @return */ public boolean isEmpty() { return mappings.isEmpty(); } /** * Method for debug / inspection purposes only, may change in future */ @Override public String toString() { return mappings == null ? "null" : mappings.toString(); } /** * Returns the first mapping found that is between 'fromSeq' and 'toSeq', or * null if none found * * @param fromSeq * aligned or dataset sequence * @param toSeq * aligned or dataset sequence * @return */ public Mapping getMappingBetween(SequenceI fromSeq, SequenceI toSeq) { SequenceI dssFrom = fromSeq.getDatasetSequence() == null ? fromSeq : fromSeq.getDatasetSequence(); SequenceI dssTo = toSeq.getDatasetSequence() == null ? toSeq : toSeq.getDatasetSequence(); for (SequenceToSequenceMapping mapping : mappings) { SequenceI from = mapping.fromSeq; SequenceI to = mapping.mapping.to; if ((from == dssFrom && to == dssTo) || (from == dssTo && to == dssFrom)) { return mapping.mapping; } } return null; } /** * Returns a hashcode derived from the list of sequence mappings * * @see SequenceToSequenceMapping#hashCode() * @see AbstractList#hashCode() */ @Override public int hashCode() { return this.mappings.hashCode(); } /** * Two AlignedCodonFrame objects are equal if they hold the same ordered list * of mappings * * @see SequenceToSequenceMapping#equals */ @Override public boolean equals(Object obj) { if (!(obj instanceof AlignedCodonFrame)) { return false; } return this.mappings.equals(((AlignedCodonFrame) obj).mappings); } public List getMappings() { return mappings; } /** * Returns the first mapping found which is between the two given sequences, * and covers the full extent of both. * * @param seq1 * @param seq2 * @return */ public SequenceToSequenceMapping getCoveringMapping(SequenceI seq1, SequenceI seq2) { for (SequenceToSequenceMapping mapping : mappings) { if (mapping.covers(seq2) && mapping.covers(seq1)) { return mapping; } } return null; } /** * Returns the first mapping found which is between the given dataset sequence * and another, is a triplet mapping (3:1 or 1:3), and covers the full extent * of both sequences involved * * @param seq * @return */ public SequenceToSequenceMapping getCoveringCodonMapping(SequenceI seq) { for (SequenceToSequenceMapping mapping : mappings) { if (mapping.getMapping().getMap().isTripletMap() && mapping.covers(seq)) { if (mapping.fromSeq == seq && mapping.covers(mapping.getMapping().getTo())) { return mapping; } else if (mapping.getMapping().getTo() == seq && mapping.covers(mapping.fromSeq)) { return mapping; } } } return null; } }