X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=d8cb9a2229159c3fbe01193d69811edd775ae3dc;hb=3647c311e9cf5b4645aaadab741dd81731116e56;hp=898e4f78adad719ac1e5a19bb41346b0057bc214;hpb=a207f4080b01543b48b80f6f3eb331b75b63bdc6;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 898e4f7..d8cb9a2 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -31,6 +31,7 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; @@ -45,7 +46,6 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -338,12 +338,12 @@ public class AlignmentUtils * Answers true if the mappings include one between the given (dataset) * sequences. */ - public static boolean mappingExists(Set set, + public static boolean mappingExists(List mappings, SequenceI aaSeq, SequenceI cdnaSeq) { - if (set != null) + if (mappings != null) { - for (AlignedCodonFrame acf : set) + for (AlignedCodonFrame acf : mappings) { if (cdnaSeq == acf.getDnaForAaSeq(aaSeq)) { @@ -514,8 +514,8 @@ public class AlignmentUtils /* * Locate the aligned source sequence whose dataset sequence is mapped. We - * just take the first match here (as we can't align cDNA like more than one - * protein sequence). + * just take the first match here (as we can't align like more than one + * sequence). */ SequenceI alignFrom = null; AlignedCodonFrame mapping = null; @@ -541,8 +541,8 @@ public class AlignmentUtils /** * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to * match residues and codons. Flags control whether existing gaps in unmapped - * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro - * and exon are only retained if both flags are set. + * (intron) and mapped (exon) regions are preserved or not. Gaps between + * intron and exon are only retained if both flags are set. * * @param alignTo * @param alignFrom @@ -558,9 +558,6 @@ public class AlignmentUtils boolean preserveUnmappedGaps) { // TODO generalise to work for Protein-Protein, dna-dna, dna-protein - final char[] thisSeq = alignTo.getSequence(); - final char[] thatAligned = alignFrom.getSequence(); - StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); // aligned and dataset sequence positions, all base zero int thisSeqPos = 0; @@ -570,13 +567,17 @@ public class AlignmentUtils char myGapChar = myGap.charAt(0); int ratio = myGap.length(); - /* - * Traverse the aligned protein sequence. - */ int fromOffset = alignFrom.getStart() - 1; int toOffset = alignTo.getStart() - 1; int sourceGapMappedLength = 0; boolean inExon = false; + final char[] thisSeq = alignTo.getSequence(); + final char[] thatAligned = alignFrom.getSequence(); + StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); + + /* + * Traverse the 'model' aligned sequence + */ for (char sourceChar : thatAligned) { if (sourceChar == sourceGap) @@ -586,7 +587,7 @@ public class AlignmentUtils } /* - * Found a residue. Locate its mapped codon (start) position. + * Found a non-gap character. Locate its mapped region if any. */ sourceDsPos++; // Note mapping positions are base 1, our sequence positions base 0 @@ -595,11 +596,13 @@ public class AlignmentUtils if (mappedPos == null) { /* - * Abort realignment if unmapped protein. Or could ignore it?? + * unmapped position; treat like a gap */ - System.err.println("Can't align: no codon mapping to residue " - + sourceDsPos + "(" + sourceChar + ")"); - return; + sourceGapMappedLength += ratio; + // System.err.println("Can't align: no codon mapping to residue " + // + sourceDsPos + "(" + sourceChar + ")"); + // return; + continue; } int mappedCodonStart = mappedPos[0]; // position (1...) of codon start @@ -669,8 +672,8 @@ public class AlignmentUtils } /* - * At end of protein sequence. Copy any remaining dna sequence, optionally - * including (intron) gaps. We do not copy trailing gaps in protein. + * At end of model aligned sequence. Copy any remaining target sequence, optionally + * including (intron) gaps. */ while (thisSeqPos < thisSeq.length) { @@ -679,6 +682,20 @@ public class AlignmentUtils { thisAligned.append(c); } + sourceGapMappedLength--; + } + + /* + * finally add gaps to pad for any trailing source gaps or + * unmapped characters + */ + if (preserveUnmappedGaps) + { + while (sourceGapMappedLength > 0) + { + thisAligned.append(myGapChar); + sourceGapMappedLength--; + } } /* @@ -909,7 +926,7 @@ public class AlignmentUtils List unmappedProtein = new ArrayList(); unmappedProtein.addAll(protein.getSequences()); - Set mappings = protein.getCodonFrames(); + List mappings = protein.getCodonFrames(); /* * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of @@ -1048,7 +1065,7 @@ public class AlignmentUtils } AlignmentI dna = al1.isNucleotide() ? al1 : al2; AlignmentI protein = dna == al1 ? al2 : al1; - Set mappings = protein.getCodonFrames(); + List mappings = protein.getCodonFrames(); for (SequenceI dnaSeq : dna.getSequences()) { for (SequenceI proteinSeq : protein.getSequences()) @@ -1072,7 +1089,7 @@ public class AlignmentUtils * @return */ protected static boolean isMappable(SequenceI dnaSeq, - SequenceI proteinSeq, Set mappings) + SequenceI proteinSeq, List mappings) { if (dnaSeq == null || proteinSeq == null) { @@ -1084,13 +1101,13 @@ public class AlignmentUtils SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq : proteinSeq.getDatasetSequence(); - /* - * Already mapped? - */ for (AlignedCodonFrame mapping : mappings) { if (proteinDs == mapping.getAaForDnaSeq(dnaDs)) { + /* + * already mapped + */ return true; } } @@ -1301,21 +1318,21 @@ public class AlignmentUtils } /** - * Constructs an alignment consisting of the mapped exon regions in the given + * Constructs an alignment consisting of the mapped cds regions in the given * nucleotide sequences, and updates mappings to match. * * @param dna * aligned dna sequences * @param mappings * from dna to protein; these are replaced with new mappings - * @return an alignment whose sequences are the exon-only parts of the dna - * sequences (or null if no exons are found) + * @return an alignment whose sequences are the cds-only parts of the dna + * sequences (or null if no cds are found) */ - public static AlignmentI makeExonAlignment(SequenceI[] dna, - Set mappings) + public static AlignmentI makeCdsAlignment(SequenceI[] dna, + List mappings) { - Set newMappings = new LinkedHashSet(); - List exonSequences = new ArrayList(); + List newMappings = new ArrayList(); + List cdsSequences = new ArrayList(); for (SequenceI dnaSeq : dna) { @@ -1325,17 +1342,17 @@ public class AlignmentUtils for (AlignedCodonFrame acf : seqMappings) { AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final List mappedExons = makeExonSequences(ds, acf, + final List mappedCds = makeCdsSequences(ds, acf, newMapping); - if (!mappedExons.isEmpty()) + if (!mappedCds.isEmpty()) { - exonSequences.addAll(mappedExons); + cdsSequences.addAll(mappedCds); newMappings.add(newMapping); } } } AlignmentI al = new Alignment( - exonSequences.toArray(new SequenceI[exonSequences.size()])); + cdsSequences.toArray(new SequenceI[cdsSequences.size()])); al.setDataset(null); /* @@ -1348,86 +1365,203 @@ public class AlignmentUtils } /** - * Helper method to make exon-only sequences and populate their mappings to + * Helper method to make cds-only sequences and populate their mappings to * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues *

- * Typically eukaryotic dna will include exons encoding for a single peptide + * Typically eukaryotic dna will include cds encoding for a single peptide * sequence i.e. return a single result. Bacterial dna may have overlapping - * exon mappings coding for multiple peptides so return multiple results + * cds mappings coding for multiple peptides so return multiple results * (example EMBL KF591215). * * @param dnaSeq * a dna dataset sequence * @param mapping * containing one or more mappings of the sequence to protein - * @param newMapping - * the new mapping to populate, from the exon-only sequences to their + * @param newMappings + * the new mapping to populate, from the cds-only sequences to their * mapped protein sequences * @return */ - protected static List makeExonSequences(SequenceI dnaSeq, - AlignedCodonFrame mapping, AlignedCodonFrame newMapping) + protected static List makeCdsSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMappings) { - List exonSequences = new ArrayList(); + List cdsSequences = new ArrayList(); List seqMappings = mapping.getMappingsForSequence(dnaSeq); - final char[] dna = dnaSeq.getSequence(); + for (Mapping seqMapping : seqMappings) { - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings); + cdsSequences.add(cds); + + /* + * add new mappings, from dna to cds, and from cds to peptide + */ + MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping, + newMappings); /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + * transfer any features on dna that overlap the CDS */ - final List dnaExonRanges = seqMapping.getMap().getFromRanges(); - for (int[] range : dnaExonRanges) + transferFeatures(dnaSeq, cds, dnaToCds, "CDS" /* SequenceOntology.CDS */); + } + return cdsSequences; + } + + /** + * Transfers any co-located features on 'fromSeq' to 'toSeq', adjusting the + * feature start/end ranges, optionally omitting specified feature types. + * + * @param fromSeq + * @param toSeq + * @param mapping + * the mapping from 'fromSeq' to 'toSeq' + * @param omitting + */ + protected static void transferFeatures(SequenceI fromSeq, + SequenceI toSeq, MapList mapping, String... omitting) + { + SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); + if (sfs != null) + { + for (SequenceFeature sf : sfs) { - for (int pos = range[0]; pos <= range[1]; pos++) + String type = sf.getType(); + boolean omit = false; + for (String toOmit : omitting) + { + if (type.equals(toOmit)) + { + omit = true; + } + } + if (omit) + { + continue; + } + + /* + * locate the mapped range - null if either start or end is + * not mapped (no partial overlaps are calculated) + */ + int[] mappedTo = mapping.locateInTo(sf.getBegin(), sf.getEnd()); + if (mappedTo != null) { - newSequence.append(dna[pos - 1]); + SequenceFeature copy = new SequenceFeature(sf); + copy.setBegin(Math.min(mappedTo[0], mappedTo[1])); + copy.setEnd(Math.max(mappedTo[0], mappedTo[1])); + toSeq.addSequenceFeature(copy); } } + } + } - SequenceI exon = new Sequence(dnaSeq.getName(), - newSequence.toString()); + /** + * Creates and adds mappings + *

    + *
  • from cds to peptide
  • + *
  • from dna to cds
  • + *
+ * and returns the dna-to-cds mapping + * + * @param dnaSeq + * @param cdsSeq + * @param dnaMapping + * @param newMappings + * @return + */ + protected static MapList addCdsMappings(SequenceI dnaSeq, + SequenceI cdsSeq, + Mapping dnaMapping, AlignedCodonFrame newMappings) + { + cdsSeq.createDatasetSequence(); - /* - * Locate any xrefs to CDS database on the protein product and attach to - * the CDS sequence. Also add as a sub-token of the sequence name. - */ - // default to "CDS" if we can't locate an actual gene id - String cdsAccId = FeatureProperties - .getCodingFeature(DBRefSource.EMBL); - DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() - .getDBRef(), DBRefSource.CODINGDBS); - if (cdsRefs != null) + /* + * CDS to peptide is just a contiguous 3:1 mapping, with + * the peptide ranges taken unchanged from the dna mapping + */ + List cdsRanges = new ArrayList(); + cdsRanges.add(new int[] { 1, cdsSeq.getLength() }); + MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() + .getToRanges(), 3, 1); + newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(), + cdsToPeptide); + + /* + * dna 'from' ranges map 1:1 to the contiguous extracted CDS + */ + MapList dnaToCds = new MapList( + dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); + newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds); + return dnaToCds; + } + + /** + * Makes and returns a CDS-only sequence, where the CDS regions are identified + * as the 'from' ranges of the mapping on the dna. Any sequence features on + * the dna which overlap the CDS regions are copied to the new sequence. + * + * @param dnaSeq + * nucleotide sequence + * @param seqMapping + * mappings from CDS regions of nucleotide + * @param exonMappings + * CDS-to-peptide mapping (to add to) + * @return + */ + protected static SequenceI makeCdsSequence(SequenceI dnaSeq, + Mapping seqMapping, AlignedCodonFrame exonMappings) + { + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + final char[] dna = dnaSeq.getSequence(); + int offset = dnaSeq.getStart() - 1; + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + final List dnaCdsRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaCdsRanges) + { + // TODO handle reverse mapping as well (range[1] < range[0]) + for (int pos = range[0]; pos <= range[1]; pos++) { - for (DBRefEntry cdsRef : cdsRefs) - { - exon.addDBRef(new DBRefEntry(cdsRef)); - cdsAccId = cdsRef.getAccessionId(); - } + newSequence.append(dna[pos - offset - 1]); } - exon.setName(exon.getName() + "|" + cdsAccId); - exon.createDatasetSequence(); + } - /* - * Build new mappings - from the same protein regions, but now to - * contiguous exons - */ - List exonRange = new ArrayList(); - exonRange.add(new int[] { 1, newSequence.length() }); - MapList map = new MapList(exonRange, seqMapping.getMap() - .getToRanges(), 3, 1); - newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); - MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); - newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); - - exonSequences.add(exon); + SequenceI cds = new Sequence(dnaSeq.getName(), + newSequence.toString()); + + transferDbRefs(seqMapping.getTo(), cds); + return cds; + } + + /** + * Locate any xrefs to CDS databases on the protein product and attach to the + * CDS sequence. Also add as a sub-token of the sequence name. + * + * @param from + * @param to + */ + protected static void transferDbRefs(SequenceI from, SequenceI to) + { + String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRef(), + DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + to.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + if (!to.getName().contains(cdsAccId)) + { + to.setName(to.getName() + "|" + cdsAccId); } - return exonSequences; } }