+ .findMappingsForSequence(dnaSeq, mappings);
+ for (AlignedCodonFrame mapping : seqMappings)
+ {
+ List<Mapping> mappingsFromSequence = mapping
+ .getMappingsFromSequence(dnaSeq);
+
+ for (Mapping aMapping : mappingsFromSequence)
+ {
+ MapList mapList = aMapping.getMap();
+ if (mapList.getFromRatio() == 1)
+ {
+ /*
+ * not a dna-to-protein mapping (likely dna-to-cds)
+ */
+ continue;
+ }
+
+ /*
+ * skip if mapping is not to one of the target set of proteins
+ */
+ SequenceI proteinProduct = aMapping.getTo();
+ if (productSeqs != null && !productSeqs.contains(proteinProduct))
+ {
+ continue;
+ }
+
+ /*
+ * try to locate the CDS from the dataset mappings;
+ * guard against duplicate results (for the case that protein has
+ * dbrefs to both dna and cds sequences)
+ */
+ SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
+ seqMappings, aMapping);
+ if (cdsSeq != null)
+ {
+ if (!foundSeqs.contains(cdsSeq))
+ {
+ foundSeqs.add(cdsSeq);
+ SequenceI derivedSequence = cdsSeq.deriveSequence();
+ cdsSeqs.add(derivedSequence);
+ if (!dataset.getSequences().contains(cdsSeq))
+ {
+ dataset.addSequence(cdsSeq);
+ }
+ }
+ continue;
+ }
+
+ /*
+ * didn't find mapped CDS sequence - construct it and add
+ * its dataset sequence to the dataset
+ */
+ cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
+ dataset).deriveSequence();
+ // cdsSeq has a name constructed as CDS|<dbref>
+ // <dbref> will be either the accession for the coding sequence,
+ // marked in the /via/ dbref to the protein product accession
+ // or it will be the original nucleotide accession.
+ SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
+
+ cdsSeqs.add(cdsSeq);
+
+ /*
+ * build the mapping from CDS to protein
+ */
+ List<int[]> cdsRange = Collections
+ .singletonList(new int[]
+ { cdsSeq.getStart(),
+ cdsSeq.getLength() + cdsSeq.getStart() - 1 });
+ MapList cdsToProteinMap = new MapList(cdsRange,
+ mapList.getToRanges(), mapList.getFromRatio(),
+ mapList.getToRatio());
+
+ if (!dataset.getSequences().contains(cdsSeqDss))
+ {
+ /*
+ * if this sequence is a newly created one, add it to the dataset
+ * and made a CDS to protein mapping (if sequence already exists,
+ * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
+ */
+ dataset.addSequence(cdsSeqDss);
+ AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
+ cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
+ cdsToProteinMap);
+
+ /*
+ * guard against duplicating the mapping if repeating this action
+ */
+ if (!mappings.contains(cdsToProteinMapping))
+ {
+ mappings.add(cdsToProteinMapping);
+ }
+ }
+
+ propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
+ proteinProduct, aMapping);
+ /*
+ * add another mapping from original 'from' range to CDS
+ */
+ AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
+ final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
+ cdsRange, 1, 1);
+ dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
+ dnaToCdsMap);
+ if (!mappings.contains(dnaToCdsMapping))
+ {
+ mappings.add(dnaToCdsMapping);
+ }
+
+ /*
+ * transfer dna chromosomal loci (if known) to the CDS
+ * sequence (via the mapping)
+ */
+ final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
+ transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
+
+ /*
+ * add DBRef with mapping from protein to CDS
+ * (this enables Get Cross-References from protein alignment)
+ * This is tricky because we can't have two DBRefs with the
+ * same source and accession, so need a different accession for
+ * the CDS from the dna sequence
+ */
+
+ // specific use case:
+ // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
+ // ENSG02, ENSG03, with transcripts and products similarly named.
+ // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
+
+ // JBPNote: ?? can't actually create an example that demonstrates we
+ // need to
+ // synthesize an xref.
+
+ List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
+ for (int ip = 0, np = primrefs.size(); ip < np; ip++)
+ {
+ DBRefEntry primRef = primrefs.get(ip);
+ /*
+ * create a cross-reference from CDS to the source sequence's
+ * primary reference and vice versa
+ */
+ String source = primRef.getSource();
+ String version = primRef.getVersion();
+ DBRefEntry cdsCrossRef = new DBRefEntry(source,
+ source + ":" + version, primRef.getAccessionId());
+ cdsCrossRef
+ .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
+ cdsSeqDss.addDBRef(cdsCrossRef);
+
+ dnaSeq.addDBRef(new DBRefEntry(source, version,
+ cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
+ // problem here is that the cross-reference is synthesized -
+ // cdsSeq.getName() may be like 'CDS|dnaaccession' or
+ // 'CDS|emblcdsacc'
+ // assuming cds version same as dna ?!?
+
+ DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
+ cdsSeq.getName());
+ //
+ proteinToCdsRef.setMap(
+ new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
+ proteinProduct.addDBRef(proteinToCdsRef);
+ }
+ /*
+ * transfer any features on dna that overlap the CDS
+ */
+ transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
+ SequenceOntologyI.CDS);
+ }
+ }
+ }
+
+ AlignmentI cds = new Alignment(
+ cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
+ cds.setDataset(dataset);
+
+ return cds;
+ }
+
+ /**
+ * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
+ * toSeq, mediated by the given mapping between the sequences
+ *
+ * @param fromSeq
+ * @param targetToFrom
+ * Map
+ * @param targetSeq
+ */
+ protected static void transferGeneLoci(SequenceI fromSeq,
+ MapList targetToFrom, SequenceI targetSeq)
+ {
+ if (targetSeq.getGeneLoci() != null)
+ {
+ // already have - don't override
+ return;
+ }
+ GeneLociI fromLoci = fromSeq.getGeneLoci();
+ if (fromLoci == null)
+ {
+ return;
+ }
+
+ MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
+
+ if (newMap != null)
+ {
+ targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
+ fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
+ }
+ }
+
+ /**
+ * A helper method that finds a CDS sequence in the alignment dataset that is
+ * mapped to the given protein sequence, and either is, or has a mapping from,
+ * the given dna sequence.
+ *
+ * @param mappings
+ * set of all mappings on the dataset
+ * @param dnaSeq
+ * a dna (or cds) sequence we are searching from
+ * @param seqMappings
+ * the set of mappings involving dnaSeq
+ * @param aMapping
+ * a transcript-to-peptide mapping
+ * @return
+ */
+ static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
+ SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
+ Mapping aMapping)
+ {
+ /*
+ * TODO a better dna-cds-protein mapping data representation to allow easy
+ * navigation; until then this clunky looping around lists of mappings
+ */
+ SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
+ : dnaSeq.getDatasetSequence();
+ SequenceI proteinProduct = aMapping.getTo();
+
+ /*
+ * is this mapping from the whole dna sequence (i.e. CDS)?
+ * allowing for possible stop codon on dna but not peptide
+ */
+ int mappedFromLength = MappingUtils
+ .getLength(aMapping.getMap().getFromRanges());
+ int dnaLength = seqDss.getLength();
+ if (mappedFromLength == dnaLength
+ || mappedFromLength == dnaLength - CODON_LENGTH)
+ {
+ /*
+ * if sequence has CDS features, this is a transcript with no UTR
+ * - do not take this as the CDS sequence! (JAL-2789)
+ */
+ if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
+ .isEmpty())
+ {
+ return seqDss;
+ }
+ }
+
+ /*
+ * looks like we found the dna-to-protein mapping; search for the
+ * corresponding cds-to-protein mapping
+ */
+ List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
+ .findMappingsForSequence(proteinProduct, mappings);
+ for (AlignedCodonFrame acf : mappingsToPeptide)
+ {
+ for (SequenceToSequenceMapping map : acf.getMappings())