X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=d8cb9a2229159c3fbe01193d69811edd775ae3dc;hb=3e115aff96aa9c7bf5d1c393e8e89fb4367c23ab;hp=1e259c29d07b7a8a43fff44ed8f3f89692b74cbf;hpb=4099019578bbeb81b76154922851ada38fb0017d;p=jalview.git diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index 1e259c2..d8cb9a2 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -31,6 +31,7 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Mapping; import jalview.datamodel.SearchResults; import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; @@ -45,7 +46,6 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -228,9 +228,8 @@ public class AlignmentUtils * @param cdnaAlignment * @return */ - public static boolean mapProteinToCdna( - final AlignmentI proteinAlignment, - final AlignmentI cdnaAlignment) + public static boolean mapProteinAlignmentToCdna( + final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment) { if (proteinAlignment == null || cdnaAlignment == null) { @@ -276,7 +275,7 @@ public class AlignmentUtils final AlignmentI cdnaAlignment, Set mappedDna, Set mappedProtein, boolean xrefsOnly) { - boolean mappingPerformed = false; + boolean mappingExistsOrAdded = false; List thisSeqs = proteinAlignment.getSequences(); for (SequenceI aaSeq : thisSeqs) { @@ -309,14 +308,18 @@ public class AlignmentUtils { continue; } - if (!mappingExists(proteinAlignment.getCodonFrames(), + if (mappingExists(proteinAlignment.getCodonFrames(), aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence())) { - MapList map = mapProteinToCdna(aaSeq, cdnaSeq); + mappingExistsOrAdded = true; + } + else + { + MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq); if (map != null) { acf.addMap(cdnaSeq, aaSeq, map); - mappingPerformed = true; + mappingExistsOrAdded = true; proteinMapped = true; mappedDna.add(cdnaSeq); mappedProtein.add(aaSeq); @@ -328,19 +331,19 @@ public class AlignmentUtils proteinAlignment.addCodonFrame(acf); } } - return mappingPerformed; + return mappingExistsOrAdded; } /** * Answers true if the mappings include one between the given (dataset) * sequences. */ - public static boolean mappingExists(Set set, + public static boolean mappingExists(List mappings, SequenceI aaSeq, SequenceI cdnaSeq) { - if (set != null) + if (mappings != null) { - for (AlignedCodonFrame acf : set) + for (AlignedCodonFrame acf : mappings) { if (cdnaSeq == acf.getDnaForAaSeq(aaSeq)) { @@ -361,7 +364,7 @@ public class AlignmentUtils * @param cdnaSeq * @return */ - public static MapList mapProteinToCdna(SequenceI proteinSeq, + public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq, SequenceI cdnaSeq) { /* @@ -385,10 +388,10 @@ public class AlignmentUtils */ final int mappedLength = 3 * aaSeqChars.length; int cdnaLength = cdnaSeqChars.length; - int cdnaStart = 1; - int cdnaEnd = cdnaLength; - final int proteinStart = 1; - final int proteinEnd = aaSeqChars.length; + int cdnaStart = cdnaSeq.getStart(); + int cdnaEnd = cdnaSeq.getEnd(); + final int proteinStart = proteinSeq.getStart(); + final int proteinEnd = proteinSeq.getEnd(); /* * If lengths don't match, try ignoring stop codon. @@ -411,12 +414,13 @@ public class AlignmentUtils /* * If lengths still don't match, try ignoring start codon. */ + int startOffset = 0; if (cdnaLength != mappedLength && cdnaLength > 2 && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase() - .equals( - ResidueProperties.START)) + .equals(ResidueProperties.START)) { + startOffset += 3; cdnaStart += 3; cdnaLength -= 3; } @@ -425,13 +429,12 @@ public class AlignmentUtils { return null; } - if (!translatesAs(cdnaSeqChars, cdnaStart - 1, aaSeqChars)) + if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars)) { return null; } - MapList map = new MapList(new int[] - { cdnaStart, cdnaEnd }, new int[] - { proteinStart, proteinEnd }, 3, 1); + MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] { + proteinStart, proteinEnd }, 3, 1); return map; } @@ -458,8 +461,7 @@ public class AlignmentUtils && aaResidue < aaSeqChars.length; i += 3, aaResidue++) { String codon = String.valueOf(cdnaSeqChars, i, 3); - final String translated = ResidueProperties.codonTranslate( - codon); + final String translated = ResidueProperties.codonTranslate(codon); /* * allow * in protein to match untranslatable in dna */ @@ -468,8 +470,7 @@ public class AlignmentUtils { continue; } - if (translated == null - || !(aaRes == translated.charAt(0))) + if (translated == null || !(aaRes == translated.charAt(0))) { // debug // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": " @@ -500,7 +501,8 @@ public class AlignmentUtils boolean preserveUnmappedGaps) { /* - * Get any mappings from the source alignment to the target (dataset) sequence. + * Get any mappings from the source alignment to the target (dataset) + * sequence. */ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with // all mappings. Would it help to constrain this? @@ -509,11 +511,11 @@ public class AlignmentUtils { return false; } - + /* * Locate the aligned source sequence whose dataset sequence is mapped. We - * just take the first match here (as we can't align cDNA like more than one - * protein sequence). + * just take the first match here (as we can't align like more than one + * sequence). */ SequenceI alignFrom = null; AlignedCodonFrame mapping = null; @@ -526,7 +528,7 @@ public class AlignmentUtils break; } } - + if (alignFrom == null) { return false; @@ -539,8 +541,8 @@ public class AlignmentUtils /** * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to * match residues and codons. Flags control whether existing gaps in unmapped - * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro - * and exon are only retained if both flags are set. + * (intron) and mapped (exon) regions are preserved or not. Gaps between + * intron and exon are only retained if both flags are set. * * @param alignTo * @param alignFrom @@ -551,15 +553,12 @@ public class AlignmentUtils * @param preserveMappedGaps */ public static void alignSequenceAs(SequenceI alignTo, - SequenceI alignFrom, - AlignedCodonFrame mapping, String myGap, char sourceGap, - boolean preserveMappedGaps, boolean preserveUnmappedGaps) + SequenceI alignFrom, AlignedCodonFrame mapping, String myGap, + char sourceGap, boolean preserveMappedGaps, + boolean preserveUnmappedGaps) { // TODO generalise to work for Protein-Protein, dna-dna, dna-protein - final char[] thisSeq = alignTo.getSequence(); - final char[] thatAligned = alignFrom.getSequence(); - StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); - + // aligned and dataset sequence positions, all base zero int thisSeqPos = 0; int sourceDsPos = 0; @@ -568,11 +567,17 @@ public class AlignmentUtils char myGapChar = myGap.charAt(0); int ratio = myGap.length(); - /* - * Traverse the aligned protein sequence. - */ + int fromOffset = alignFrom.getStart() - 1; + int toOffset = alignTo.getStart() - 1; int sourceGapMappedLength = 0; boolean inExon = false; + final char[] thisSeq = alignTo.getSequence(); + final char[] thatAligned = alignFrom.getSequence(); + StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); + + /* + * Traverse the 'model' aligned sequence + */ for (char sourceChar : thatAligned) { if (sourceChar == sourceGap) @@ -582,20 +587,22 @@ public class AlignmentUtils } /* - * Found a residue. Locate its mapped codon (start) position. + * Found a non-gap character. Locate its mapped region if any. */ sourceDsPos++; // Note mapping positions are base 1, our sequence positions base 0 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom, - sourceDsPos); + sourceDsPos + fromOffset); if (mappedPos == null) { /* - * Abort realignment if unmapped protein. Or could ignore it?? + * unmapped position; treat like a gap */ - System.err.println("Can't align: no codon mapping to residue " - + sourceDsPos + "(" + sourceChar + ")"); - return; + sourceGapMappedLength += ratio; + // System.err.println("Can't align: no codon mapping to residue " + // + sourceDsPos + "(" + sourceChar + ")"); + // return; + continue; } int mappedCodonStart = mappedPos[0]; // position (1...) of codon start @@ -611,14 +618,15 @@ public class AlignmentUtils * But then 'align dna as protein' doesn't make much sense otherwise. */ int intronLength = 0; - while (basesWritten < mappedCodonEnd && thisSeqPos < thisSeq.length) + while (basesWritten + toOffset < mappedCodonEnd + && thisSeqPos < thisSeq.length) { final char c = thisSeq[thisSeqPos++]; if (c != myGapChar) { basesWritten++; - - if (basesWritten < mappedCodonStart) + int sourcePosition = basesWritten + toOffset; + if (sourcePosition < mappedCodonStart) { /* * Found an unmapped (intron) base. First add in any preceding gaps @@ -635,7 +643,7 @@ public class AlignmentUtils } else { - final boolean startOfCodon = basesWritten == mappedCodonStart; + final boolean startOfCodon = sourcePosition == mappedCodonStart; int gapsToAdd = calculateGapsToInsert(preserveMappedGaps, preserveUnmappedGaps, sourceGapMappedLength, inExon, trailingCopiedGap.length(), intronLength, startOfCodon); @@ -664,8 +672,8 @@ public class AlignmentUtils } /* - * At end of protein sequence. Copy any remaining dna sequence, optionally - * including (intron) gaps. We do not copy trailing gaps in protein. + * At end of model aligned sequence. Copy any remaining target sequence, optionally + * including (intron) gaps. */ while (thisSeqPos < thisSeq.length) { @@ -674,6 +682,20 @@ public class AlignmentUtils { thisAligned.append(c); } + sourceGapMappedLength--; + } + + /* + * finally add gaps to pad for any trailing source gaps or + * unmapped characters + */ + if (preserveUnmappedGaps) + { + while (sourceGapMappedLength > 0) + { + thisAligned.append(myGapChar); + sourceGapMappedLength--; + } } /* @@ -696,8 +718,8 @@ public class AlignmentUtils */ protected static int calculateGapsToInsert(boolean preserveMappedGaps, boolean preserveUnmappedGaps, int sourceGapMappedLength, - boolean inExon, int trailingGapLength, - int intronLength, final boolean startOfCodon) + boolean inExon, int trailingGapLength, int intronLength, + final boolean startOfCodon) { int gapsToAdd = 0; if (startOfCodon) @@ -821,8 +843,8 @@ public class AlignmentUtils // mapping is from protein to nucleotide toDna = true; // should ideally get gap count ratio from mapping - gap = String.valueOf(new char[] - { gapCharacter, gapCharacter, gapCharacter }); + gap = String.valueOf(new char[] { gapCharacter, gapCharacter, + gapCharacter }); } else { @@ -901,7 +923,10 @@ public class AlignmentUtils */ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) { - Set mappings = protein.getCodonFrames(); + List unmappedProtein = new ArrayList(); + unmappedProtein.addAll(protein.getSequences()); + + List mappings = protein.getCodonFrames(); /* * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of @@ -921,10 +946,11 @@ public class AlignmentUtils { addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); + unmappedProtein.remove(prot); } } } - return alignProteinAs(protein, alignedCodons); + return alignProteinAs(protein, alignedCodons, unmappedProtein); } /** @@ -935,10 +961,12 @@ public class AlignmentUtils * @param alignedCodons * an ordered map of codon positions (columns), with sequence/peptide * values present in each column + * @param unmappedProtein * @return */ protected static int alignProteinAs(AlignmentI protein, - Map> alignedCodons) + Map> alignedCodons, + List unmappedProtein) { /* * Prefill aligned sequences with gaps before inserting aligned protein @@ -950,15 +978,18 @@ public class AlignmentUtils String allGaps = String.valueOf(gaps); for (SequenceI seq : protein.getSequences()) { - seq.setSequence(allGaps); + if (!unmappedProtein.contains(seq)) + { + seq.setSequence(allGaps); + } } int column = 0; for (AlignedCodon codon : alignedCodons.keySet()) { - final Map columnResidues = alignedCodons.get(codon); - for (Entry entry : columnResidues - .entrySet()) + final Map columnResidues = alignedCodons + .get(codon); + for (Entry entry : columnResidues.entrySet()) { // place translated codon at its column position in sequence entry.getKey().getSequence()[column] = entry.getValue().charAt(0); @@ -985,8 +1016,7 @@ public class AlignmentUtils * the map we are building up */ static void addCodonPositions(SequenceI dna, SequenceI protein, - char gapChar, - Mapping seqMap, + char gapChar, Mapping seqMap, Map> alignedCodons) { Iterator codons = seqMap.getCodonIterator(dna, gapChar); @@ -1035,7 +1065,7 @@ public class AlignmentUtils } AlignmentI dna = al1.isNucleotide() ? al1 : al2; AlignmentI protein = dna == al1 ? al2 : al1; - Set mappings = protein.getCodonFrames(); + List mappings = protein.getCodonFrames(); for (SequenceI dnaSeq : dna.getSequences()) { for (SequenceI proteinSeq : protein.getSequences()) @@ -1059,23 +1089,25 @@ public class AlignmentUtils * @return */ protected static boolean isMappable(SequenceI dnaSeq, - SequenceI proteinSeq, - Set mappings) + SequenceI proteinSeq, List mappings) { if (dnaSeq == null || proteinSeq == null) { return false; } - SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq.getDatasetSequence(); + SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq + .getDatasetSequence(); SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq : proteinSeq.getDatasetSequence(); - - /* - * Already mapped? - */ - for (AlignedCodonFrame mapping : mappings) { - if ( proteinDs == mapping.getAaForDnaSeq(dnaDs)) { + + for (AlignedCodonFrame mapping : mappings) + { + if (proteinDs == mapping.getAaForDnaSeq(dnaDs)) + { + /* + * already mapped + */ return true; } } @@ -1084,7 +1116,7 @@ public class AlignmentUtils * Just try to make a mapping (it is not yet stored), test whether * successful. */ - return mapProteinToCdna(proteinDs, dnaDs) != null; + return mapProteinSequenceToCdna(proteinDs, dnaDs) != null; } /** @@ -1103,7 +1135,8 @@ public class AlignmentUtils * the alignment to check for presence of annotations */ public static void findAddableReferenceAnnotations( - List sequenceScope, Map labelForCalcId, + List sequenceScope, + Map labelForCalcId, final Map> candidates, AlignmentI al) { @@ -1111,7 +1144,7 @@ public class AlignmentUtils { return; } - + /* * For each sequence in scope, make a list of any annotations on the * underlying dataset sequence which are not already on the alignment. @@ -1139,8 +1172,7 @@ public class AlignmentUtils * sequence. */ final Iterable matchedAlignmentAnnotations = al - .findAnnotations(seq, dsann.getCalcId(), - dsann.label); + .findAnnotations(seq, dsann.getCalcId(), dsann.label); if (!matchedAlignmentAnnotations.iterator().hasNext()) { result.add(dsann); @@ -1188,7 +1220,7 @@ public class AlignmentUtils endRes = selectionGroup.getEndRes(); } copyAnn.restrict(startRes, endRes); - + /* * Add to the sequence (sets copyAnn.datasetSequence), unless the * original annotation is already on the sequence. @@ -1226,8 +1258,7 @@ public class AlignmentUtils Collection types, List forSequences, boolean anyType, boolean doShow) { - for (AlignmentAnnotation aa : al - .getAlignmentAnnotation()) + for (AlignmentAnnotation aa : al.getAlignmentAnnotation()) { if (anyType || types.contains(aa.label)) { @@ -1287,22 +1318,22 @@ public class AlignmentUtils } /** - * Constructs an alignment consisting of the mapped exon regions in the given + * Constructs an alignment consisting of the mapped cds regions in the given * nucleotide sequences, and updates mappings to match. * * @param dna * aligned dna sequences * @param mappings * from dna to protein; these are replaced with new mappings - * @return an alignment whose sequences are the exon-only parts of the dna - * sequences (or null if no exons are found) + * @return an alignment whose sequences are the cds-only parts of the dna + * sequences (or null if no cds are found) */ - public static AlignmentI makeExonAlignment(SequenceI[] dna, - Set mappings) + public static AlignmentI makeCdsAlignment(SequenceI[] dna, + List mappings) { - Set newMappings = new LinkedHashSet(); - List exonSequences = new ArrayList(); - + List newMappings = new ArrayList(); + List cdsSequences = new ArrayList(); + for (SequenceI dnaSeq : dna) { final SequenceI ds = dnaSeq.getDatasetSequence(); @@ -1311,17 +1342,17 @@ public class AlignmentUtils for (AlignedCodonFrame acf : seqMappings) { AlignedCodonFrame newMapping = new AlignedCodonFrame(); - final List mappedExons = makeExonSequences(ds, acf, + final List mappedCds = makeCdsSequences(ds, acf, newMapping); - if (!mappedExons.isEmpty()) + if (!mappedCds.isEmpty()) { - exonSequences.addAll(mappedExons); + cdsSequences.addAll(mappedCds); newMappings.add(newMapping); } } } AlignmentI al = new Alignment( - exonSequences.toArray(new SequenceI[exonSequences.size()])); + cdsSequences.toArray(new SequenceI[cdsSequences.size()])); al.setDataset(null); /* @@ -1334,88 +1365,203 @@ public class AlignmentUtils } /** - * Helper method to make exon-only sequences and populate their mappings to + * Helper method to make cds-only sequences and populate their mappings to * protein products *

* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein * then generate a sequence CCTTGA with mapping [1, 6] to the same protein * residues *

- * Typically eukaryotic dna will include exons encoding for a single peptide + * Typically eukaryotic dna will include cds encoding for a single peptide * sequence i.e. return a single result. Bacterial dna may have overlapping - * exon mappings coding for multiple peptides so return multiple results + * cds mappings coding for multiple peptides so return multiple results * (example EMBL KF591215). * * @param dnaSeq * a dna dataset sequence * @param mapping * containing one or more mappings of the sequence to protein - * @param newMapping - * the new mapping to populate, from the exon-only sequences to their + * @param newMappings + * the new mapping to populate, from the cds-only sequences to their * mapped protein sequences * @return */ - protected static List makeExonSequences(SequenceI dnaSeq, - AlignedCodonFrame mapping, AlignedCodonFrame newMapping) + protected static List makeCdsSequences(SequenceI dnaSeq, + AlignedCodonFrame mapping, AlignedCodonFrame newMappings) { - List exonSequences = new ArrayList(); + List cdsSequences = new ArrayList(); List seqMappings = mapping.getMappingsForSequence(dnaSeq); - final char[] dna = dnaSeq.getSequence(); + for (Mapping seqMapping : seqMappings) { - StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings); + cdsSequences.add(cds); + + /* + * add new mappings, from dna to cds, and from cds to peptide + */ + MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping, + newMappings); /* - * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + * transfer any features on dna that overlap the CDS */ - final List dnaExonRanges = seqMapping.getMap().getFromRanges(); - for (int[] range : dnaExonRanges) + transferFeatures(dnaSeq, cds, dnaToCds, "CDS" /* SequenceOntology.CDS */); + } + return cdsSequences; + } + + /** + * Transfers any co-located features on 'fromSeq' to 'toSeq', adjusting the + * feature start/end ranges, optionally omitting specified feature types. + * + * @param fromSeq + * @param toSeq + * @param mapping + * the mapping from 'fromSeq' to 'toSeq' + * @param omitting + */ + protected static void transferFeatures(SequenceI fromSeq, + SequenceI toSeq, MapList mapping, String... omitting) + { + SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); + if (sfs != null) + { + for (SequenceFeature sf : sfs) { - for (int pos = range[0]; pos <= range[1]; pos++) + String type = sf.getType(); + boolean omit = false; + for (String toOmit : omitting) + { + if (type.equals(toOmit)) + { + omit = true; + } + } + if (omit) { - newSequence.append(dna[pos - 1]); + continue; + } + + /* + * locate the mapped range - null if either start or end is + * not mapped (no partial overlaps are calculated) + */ + int[] mappedTo = mapping.locateInTo(sf.getBegin(), sf.getEnd()); + if (mappedTo != null) + { + SequenceFeature copy = new SequenceFeature(sf); + copy.setBegin(Math.min(mappedTo[0], mappedTo[1])); + copy.setEnd(Math.max(mappedTo[0], mappedTo[1])); + toSeq.addSequenceFeature(copy); } } + } + } + + /** + * Creates and adds mappings + *

    + *
  • from cds to peptide
  • + *
  • from dna to cds
  • + *
+ * and returns the dna-to-cds mapping + * + * @param dnaSeq + * @param cdsSeq + * @param dnaMapping + * @param newMappings + * @return + */ + protected static MapList addCdsMappings(SequenceI dnaSeq, + SequenceI cdsSeq, + Mapping dnaMapping, AlignedCodonFrame newMappings) + { + cdsSeq.createDatasetSequence(); - SequenceI exon = new Sequence(dnaSeq.getName(), - newSequence.toString()); + /* + * CDS to peptide is just a contiguous 3:1 mapping, with + * the peptide ranges taken unchanged from the dna mapping + */ + List cdsRanges = new ArrayList(); + cdsRanges.add(new int[] { 1, cdsSeq.getLength() }); + MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap() + .getToRanges(), 3, 1); + newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(), + cdsToPeptide); - /* - * Locate any xrefs to CDS database on the protein product and attach to - * the CDS sequence. Also add as a sub-token of the sequence name. - */ - // default to "CDS" if we can't locate an actual gene id - String cdsAccId = FeatureProperties - .getCodingFeature(DBRefSource.EMBL); - DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo() - .getDBRef(), DBRefSource.CODINGDBS); - if (cdsRefs != null) + /* + * dna 'from' ranges map 1:1 to the contiguous extracted CDS + */ + MapList dnaToCds = new MapList( + dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1); + newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds); + return dnaToCds; + } + + /** + * Makes and returns a CDS-only sequence, where the CDS regions are identified + * as the 'from' ranges of the mapping on the dna. Any sequence features on + * the dna which overlap the CDS regions are copied to the new sequence. + * + * @param dnaSeq + * nucleotide sequence + * @param seqMapping + * mappings from CDS regions of nucleotide + * @param exonMappings + * CDS-to-peptide mapping (to add to) + * @return + */ + protected static SequenceI makeCdsSequence(SequenceI dnaSeq, + Mapping seqMapping, AlignedCodonFrame exonMappings) + { + StringBuilder newSequence = new StringBuilder(dnaSeq.getLength()); + final char[] dna = dnaSeq.getSequence(); + int offset = dnaSeq.getStart() - 1; + + /* + * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc } + */ + final List dnaCdsRanges = seqMapping.getMap().getFromRanges(); + for (int[] range : dnaCdsRanges) + { + // TODO handle reverse mapping as well (range[1] < range[0]) + for (int pos = range[0]; pos <= range[1]; pos++) { - for (DBRefEntry cdsRef : cdsRefs) - { - exon.addDBRef(new DBRefEntry(cdsRef)); - cdsAccId = cdsRef.getAccessionId(); - } + newSequence.append(dna[pos - offset - 1]); } - exon.setName(exon.getName() + "|" + cdsAccId); - exon.createDatasetSequence(); + } - /* - * Build new mappings - from the same protein regions, but now to - * contiguous exons - */ - List exonRange = new ArrayList(); - exonRange.add(new int[] - { 1, newSequence.length() }); - MapList map = new MapList(exonRange, seqMapping.getMap() - .getToRanges(), - 3, 1); - newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map); - MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1); - newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap); - - exonSequences.add(exon); + SequenceI cds = new Sequence(dnaSeq.getName(), + newSequence.toString()); + + transferDbRefs(seqMapping.getTo(), cds); + return cds; + } + + /** + * Locate any xrefs to CDS databases on the protein product and attach to the + * CDS sequence. Also add as a sub-token of the sequence name. + * + * @param from + * @param to + */ + protected static void transferDbRefs(SequenceI from, SequenceI to) + { + String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL); + DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRef(), + DBRefSource.CODINGDBS); + if (cdsRefs != null) + { + for (DBRefEntry cdsRef : cdsRefs) + { + to.addDBRef(new DBRefEntry(cdsRef)); + cdsAccId = cdsRef.getAccessionId(); + } + } + if (!to.getName().contains(cdsAccId)) + { + to.setName(to.getName() + "|" + cdsAccId); } - return exonSequences; } }