+ }
+
+ /**
+ * A helper method that finds a CDS sequence in the alignment dataset that is
+ * mapped to the given protein sequence, and either is, or has a mapping from,
+ * the given dna sequence.
+ *
+ * @param mappings
+ * set of all mappings on the dataset
+ * @param dnaSeq
+ * a dna (or cds) sequence we are searching from
+ * @param seqMappings
+ * the set of mappings involving dnaSeq
+ * @param aMapping
+ * a transcript-to-peptide mapping
+ * @return
+ */
+ static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
+ SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
+ Mapping aMapping)
+ {
+ /*
+ * TODO a better dna-cds-protein mapping data representation to allow easy
+ * navigation; until then this clunky looping around lists of mappings
+ */
+ SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
+ : dnaSeq.getDatasetSequence();
+ SequenceI proteinProduct = aMapping.getTo();
+
+ /*
+ * is this mapping from the whole dna sequence (i.e. CDS)?
+ * allowing for possible stop codon on dna but not peptide
+ */
+ int mappedFromLength = MappingUtils
+ .getLength(aMapping.getMap().getFromRanges());
+ int dnaLength = seqDss.getLength();
+ if (mappedFromLength == dnaLength
+ || mappedFromLength == dnaLength - CODON_LENGTH)
+ {
+ /*
+ * if sequence has CDS features, this is a transcript with no UTR
+ * - do not take this as the CDS sequence! (JAL-2789)
+ */
+ if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
+ .isEmpty())
+ {
+ return seqDss;
+ }
+ }
+
+ /*
+ * looks like we found the dna-to-protein mapping; search for the
+ * corresponding cds-to-protein mapping
+ */
+ List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
+ .findMappingsForSequence(proteinProduct, mappings);
+ for (AlignedCodonFrame acf : mappingsToPeptide)
+ {
+ for (SequenceToSequenceMapping map : acf.getMappings())
+ {
+ Mapping mapping = map.getMapping();
+ if (mapping != aMapping
+ && mapping.getMap().getFromRatio() == CODON_LENGTH
+ && proteinProduct == mapping.getTo()
+ && seqDss != map.getFromSeq())
+ {
+ mappedFromLength = MappingUtils
+ .getLength(mapping.getMap().getFromRanges());
+ if (mappedFromLength == map.getFromSeq().getLength())
+ {
+ /*
+ * found a 3:1 mapping to the protein product which covers
+ * the whole dna sequence i.e. is from CDS; finally check the CDS
+ * is mapped from the given dna start sequence
+ */
+ SequenceI cdsSeq = map.getFromSeq();
+ // todo this test is weak if seqMappings contains multiple mappings;
+ // we get away with it if transcript:cds relationship is 1:1
+ List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
+ .findMappingsForSequence(cdsSeq, seqMappings);
+ if (!dnaToCdsMaps.isEmpty())
+ {
+ return cdsSeq;
+ }
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Helper method that makes a CDS sequence as defined by the mappings from the
+ * given sequence i.e. extracts the 'mapped from' ranges (which may be on
+ * forward or reverse strand).
+ *
+ * @param seq
+ * @param mapping
+ * @param dataset
+ * - existing dataset. We check for sequences that look like the CDS
+ * we are about to construct, if one exists already, then we will
+ * just return that one.
+ * @return CDS sequence (as a dataset sequence)
+ */
+ static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
+ AlignmentI dataset)
+ {
+ /*
+ * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
+ * if set (e.g. EMBL protein_id), else sequence name appended
+ */
+ String mapFromId = mapping.getMappedFromId();
+ final String seqId = "CDS|"
+ + (mapFromId != null ? mapFromId : seq.getName());
+
+ SequenceI newSeq = null;
+
+ /*
+ * construct CDS sequence by splicing mapped from ranges
+ */
+ char[] seqChars = seq.getSequence();
+ List<int[]> fromRanges = mapping.getMap().getFromRanges();
+ int cdsWidth = MappingUtils.getLength(fromRanges);
+ char[] newSeqChars = new char[cdsWidth];
+
+ int newPos = 0;
+ for (int[] range : fromRanges)
+ {
+ if (range[0] <= range[1])
+ {
+ // forward strand mapping - just copy the range
+ int length = range[1] - range[0] + 1;
+ System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
+ length);
+ newPos += length;
+ }
+ else
+ {
+ // reverse strand mapping - copy and complement one by one
+ for (int i = range[0]; i >= range[1]; i--)
+ {
+ newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
+ }
+ }
+
+ newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
+ }
+
+ if (dataset != null)
+ {
+ SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
+ if (matches != null)
+ {
+ boolean matched = false;
+ for (SequenceI mtch : matches)
+ {
+ if (mtch.getStart() != newSeq.getStart())
+ {
+ continue;
+ }
+ if (mtch.getEnd() != newSeq.getEnd())
+ {
+ continue;
+ }
+ if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
+ {
+ continue;
+ }
+ if (!matched)
+ {
+ matched = true;
+ newSeq = mtch;
+ }
+ else
+ {
+ Cache.log.error(
+ "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:" + mtch.toString());
+ }
+ }
+ }
+ }
+ // newSeq.setDescription(mapFromId);
+
+ return newSeq;
+ }
+
+ /**
+ * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
+ * the given mapping.
+ *
+ * @param cdsSeq
+ * @param contig
+ * @param proteinProduct
+ * @param mapping
+ * @return list of DBRefEntrys added
+ */
+ protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
+ SequenceI contig, SequenceI proteinProduct, Mapping mapping)
+ {
+
+ // gather direct refs from contig congruent with mapping
+ List<DBRefEntry> direct = new ArrayList<>();
+ HashSet<String> directSources = new HashSet<>();
+
+ if (contig.getDBRefs() != null)
+ {
+ for (DBRefEntry dbr : contig.getDBRefs())
+ {
+ if (dbr.hasMap() && dbr.getMap().getMap().isTripletMap())
+ {
+ MapList map = dbr.getMap().getMap();
+ // check if map is the CDS mapping
+ if (mapping.getMap().equals(map))
+ {
+ direct.add(dbr);
+ directSources.add(dbr.getSource());
+ }
+ }
+ }
+ }
+ DBRefEntry[] onSource = DBRefUtils.selectRefs(
+ proteinProduct.getDBRefs(),
+ directSources.toArray(new String[0]));
+ List<DBRefEntry> propagated = new ArrayList<>();
+
+ // and generate appropriate mappings
+ for (DBRefEntry cdsref : direct)
+ {
+ // clone maplist and mapping
+ MapList cdsposmap = new MapList(
+ Arrays.asList(new int[][]
+ { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
+ cdsref.getMap().getMap().getToRanges(), 3, 1);
+ Mapping cdsmap = new Mapping(cdsref.getMap().getTo(),
+ cdsref.getMap().getMap());
+
+ // create dbref
+ DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
+ cdsref.getVersion(), cdsref.getAccessionId(),
+ new Mapping(cdsmap.getTo(), cdsposmap));
+
+ // and see if we can map to the protein product for this mapping.
+ // onSource is the filtered set of accessions on protein that we are
+ // tranferring, so we assume accession is the same.
+ if (cdsmap.getTo() == null && onSource != null)
+ {
+ List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
+ cdsref.getAccessionId());
+ if (sourceRefs != null)
+ {
+ for (DBRefEntry srcref : sourceRefs)
+ {
+ if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
+ {
+ // we have found a complementary dbref on the protein product, so
+ // update mapping's getTo
+ newref.getMap().setTo(proteinProduct);
+ }
+ }
+ }
+ }
+ cdsSeq.addDBRef(newref);
+ propagated.add(newref);
+ }
+ return propagated;