X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=d8cb9a2229159c3fbe01193d69811edd775ae3dc;hb=3e115aff96aa9c7bf5d1c393e8e89fb4367c23ab;hp=0e30d8cbdabb1bfb7423f7692da7a52e56d43ae6;hpb=ae014811dc190f0d4532728fcffa58264d288b6b;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 0e30d8c..d8cb9a2 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -31,6 +31,7 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; @@ -1317,21 +1318,21 @@ public class AlignmentUtils } /** - * Constructs an alignment consisting of the mapped exon regions in the given + * Constructs an alignment consisting of the mapped cds regions in the given * nucleotide sequences, and updates mappings to match. * * @param dna * aligned dna sequences * @param mappings * from dna to protein; these are replaced with new mappings - * @return an alignment whose sequences are the exon-only parts of the dna - * sequences (or null if no exons are found) + * @return an alignment whose sequences are the cds-only parts of the dna + * sequences (or null if no cds are found) */ - public static AlignmentI makeExonAlignment(SequenceI[] dna, + public static AlignmentI makeCdsAlignment(SequenceI[] dna, List mappings) { List newMappings = new ArrayList(); - List exonSequences = new ArrayList(); + List cdsSequences = new ArrayList(); for (SequenceI dnaSeq : dna) { @@ -1341,17 +1342,17 @@ public class AlignmentUtils for (AlignedCodonFrame acf : seqMappings) { AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final List mappedExons = makeExonSequences(ds, acf, + final List mappedCds = makeCdsSequences(ds, acf, newMapping); - if (!mappedExons.isEmpty()) + if (!mappedCds.isEmpty()) { - exonSequences.addAll(mappedExons); + cdsSequences.addAll(mappedCds); newMappings.add(newMapping); } } } AlignmentI al = new Alignment( - exonSequences.toArray(new SequenceI[exonSequences.size()])); + cdsSequences.toArray(new SequenceI[cdsSequences.size()])); al.setDataset(null); /* @@ -1364,86 +1365,203 @@ public class AlignmentUtils } /** - * Helper method to make exon-only sequences and populate their mappings to + * Helper method to make cds-only sequences and populate their mappings to * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues *

- * Typically eukaryotic dna will include exons encoding for a single peptide + * Typically eukaryotic dna will include cds encoding for a single peptide * sequence i.e. return a single result. Bacterial dna may have overlapping - * exon mappings coding for multiple peptides so return multiple results + * cds mappings coding for multiple peptides so return multiple results * (example EMBL KF591215). * * @param dnaSeq * a dna dataset sequence * @param mapping * containing one or more mappings of the sequence to protein - * @param newMapping - * the new mapping to populate, from the exon-only sequences to their + * @param newMappings + * the new mapping to populate, from the cds-only sequences to their * mapped protein sequences * @return */ - protected static List makeExonSequences(SequenceI dnaSeq, - AlignedCodonFrame mapping, AlignedCodonFrame newMapping) + protected static List makeCdsSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMappings) { - List exonSequences = new ArrayList(); + List cdsSequences = new ArrayList(); List seqMappings = mapping.getMappingsForSequence(dnaSeq); - final char[] dna = dnaSeq.getSequence(); + for (Mapping seqMapping : seqMappings) { - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings); + cdsSequences.add(cds); + + /* + * add new mappings, from dna to cds, and from cds to peptide + */ + MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping, + newMappings); /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + * transfer any features on dna that overlap the CDS */ - final List dnaExonRanges = seqMapping.getMap().getFromRanges(); - for (int[] range : dnaExonRanges) + transferFeatures(dnaSeq, cds, dnaToCds, "CDS" /* SequenceOntology.CDS */); + } + return cdsSequences; + } + + /** + * Transfers any co-located features on 'fromSeq' to 'toSeq', adjusting the + * feature start/end ranges, optionally omitting specified feature types. + * + * @param fromSeq + * @param toSeq + * @param mapping + * the mapping from 'fromSeq' to 'toSeq' + * @param omitting + */ + protected static void transferFeatures(SequenceI fromSeq, + SequenceI toSeq, MapList mapping, String... omitting) + { + SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); + if (sfs != null) + { + for (SequenceFeature sf : sfs) { - for (int pos = range[0]; pos <= range[1]; pos++) + String type = sf.getType(); + boolean omit = false; + for (String toOmit : omitting) + { + if (type.equals(toOmit)) + { + omit = true; + } + } + if (omit) { - newSequence.append(dna[pos - 1]); + continue; + } + + /* + * locate the mapped range - null if either start or end is + * not mapped (no partial overlaps are calculated) + */ + int[] mappedTo = mapping.locateInTo(sf.getBegin(), sf.getEnd()); + if (mappedTo != null) + { + SequenceFeature copy = new SequenceFeature(sf); + copy.setBegin(Math.min(mappedTo[0], mappedTo[1])); + copy.setEnd(Math.max(mappedTo[0], mappedTo[1])); + toSeq.addSequenceFeature(copy); } } + } + } - SequenceI exon = new Sequence(dnaSeq.getName(), - newSequence.toString()); + /** + * Creates and adds mappings + *

    + *
  • from cds to peptide
  • + *
  • from dna to cds
  • + *
+ * and returns the dna-to-cds mapping + * + * @param dnaSeq + * @param cdsSeq + * @param dnaMapping + * @param newMappings + * @return + */ + protected static MapList addCdsMappings(SequenceI dnaSeq, + SequenceI cdsSeq, + Mapping dnaMapping, AlignedCodonFrame newMappings) + { + cdsSeq.createDatasetSequence(); - /* - * Locate any xrefs to CDS database on the protein product and attach to - * the CDS sequence. Also add as a sub-token of the sequence name. - */ - // default to "CDS" if we can't locate an actual gene id - String cdsAccId = FeatureProperties - .getCodingFeature(DBRefSource.EMBL); - DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() - .getDBRef(), DBRefSource.CODINGDBS); - if (cdsRefs != null) + /* + * CDS to peptide is just a contiguous 3:1 mapping, with + * the peptide ranges taken unchanged from the dna mapping + */ + List cdsRanges = new ArrayList(); + cdsRanges.add(new int[] { 1, cdsSeq.getLength() }); + MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() + .getToRanges(), 3, 1); + newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(), + cdsToPeptide); + + /* + * dna 'from' ranges map 1:1 to the contiguous extracted CDS + */ + MapList dnaToCds = new MapList( + dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); + newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds); + return dnaToCds; + } + + /** + * Makes and returns a CDS-only sequence, where the CDS regions are identified + * as the 'from' ranges of the mapping on the dna. Any sequence features on + * the dna which overlap the CDS regions are copied to the new sequence. + * + * @param dnaSeq + * nucleotide sequence + * @param seqMapping + * mappings from CDS regions of nucleotide + * @param exonMappings + * CDS-to-peptide mapping (to add to) + * @return + */ + protected static SequenceI makeCdsSequence(SequenceI dnaSeq, + Mapping seqMapping, AlignedCodonFrame exonMappings) + { + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + final char[] dna = dnaSeq.getSequence(); + int offset = dnaSeq.getStart() - 1; + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + final List dnaCdsRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaCdsRanges) + { + // TODO handle reverse mapping as well (range[1] < range[0]) + for (int pos = range[0]; pos <= range[1]; pos++) { - for (DBRefEntry cdsRef : cdsRefs) - { - exon.addDBRef(new DBRefEntry(cdsRef)); - cdsAccId = cdsRef.getAccessionId(); - } + newSequence.append(dna[pos - offset - 1]); } - exon.setName(exon.getName() + "|" + cdsAccId); - exon.createDatasetSequence(); + } - /* - * Build new mappings - from the same protein regions, but now to - * contiguous exons - */ - List exonRange = new ArrayList(); - exonRange.add(new int[] { 1, newSequence.length() }); - MapList map = new MapList(exonRange, seqMapping.getMap() - .getToRanges(), 3, 1); - newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); - MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); - newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); - - exonSequences.add(exon); + SequenceI cds = new Sequence(dnaSeq.getName(), + newSequence.toString()); + + transferDbRefs(seqMapping.getTo(), cds); + return cds; + } + + /** + * Locate any xrefs to CDS databases on the protein product and attach to the + * CDS sequence. Also add as a sub-token of the sequence name. + * + * @param from + * @param to + */ + protected static void transferDbRefs(SequenceI from, SequenceI to) + { + String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRef(), + DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + to.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + if (!to.getName().contains(cdsAccId)) + { + to.setName(to.getName() + "|" + cdsAccId); } - return exonSequences; } }