+ AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
+ .size()]));
+ cds.setDataset(dataset);
+
+ return cds;
+ }
+
+ /**
+ * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
+ * toSeq, mediated by the given mapping between the sequences
+ *
+ * @param fromSeq
+ * @param targetToFrom
+ * Map
+ * @param targetSeq
+ */
+ protected static void transferGeneLoci(SequenceI fromSeq,
+ MapList targetToFrom, SequenceI targetSeq)
+ {
+ if (targetSeq.getGeneLoci() != null)
+ {
+ // already have - don't override
+ return;
+ }
+ GeneLociI fromLoci = fromSeq.getGeneLoci();
+ if (fromLoci == null)
+ {
+ return;
+ }
+
+ MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
+
+ if (newMap != null)
+ {
+ targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
+ fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
+ }
+ }
+
+ /**
+ * A helper method that finds a CDS sequence in the alignment dataset that is
+ * mapped to the given protein sequence, and either is, or has a mapping from,
+ * the given dna sequence.
+ *
+ * @param mappings
+ * set of all mappings on the dataset
+ * @param dnaSeq
+ * a dna (or cds) sequence we are searching from
+ * @param seqMappings
+ * the set of mappings involving dnaSeq
+ * @param aMapping
+ * a transcript-to-peptide mapping
+ * @return
+ */
+ static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
+ SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
+ Mapping aMapping)
+ {
+ /*
+ * TODO a better dna-cds-protein mapping data representation to allow easy
+ * navigation; until then this clunky looping around lists of mappings
+ */
+ SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
+ : dnaSeq.getDatasetSequence();
+ SequenceI proteinProduct = aMapping.getTo();
+
+ /*
+ * is this mapping from the whole dna sequence (i.e. CDS)?
+ * allowing for possible stop codon on dna but not peptide
+ */
+ int mappedFromLength = MappingUtils
+ .getLength(aMapping.getMap().getFromRanges());
+ int dnaLength = seqDss.getLength();
+ if (mappedFromLength == dnaLength
+ || mappedFromLength == dnaLength - CODON_LENGTH)
+ {
+ /*
+ * if sequence has CDS features, this is a transcript with no UTR
+ * - do not take this as the CDS sequence! (JAL-2789)
+ */
+ if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
+ .isEmpty())
+ {
+ return seqDss;
+ }
+ }
+
+ /*
+ * looks like we found the dna-to-protein mapping; search for the
+ * corresponding cds-to-protein mapping
+ */
+ List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
+ .findMappingsForSequence(proteinProduct, mappings);
+ for (AlignedCodonFrame acf : mappingsToPeptide)
+ {
+ for (SequenceToSequenceMapping map : acf.getMappings())
+ {
+ Mapping mapping = map.getMapping();
+ if (mapping != aMapping
+ && mapping.getMap().getFromRatio() == CODON_LENGTH
+ && proteinProduct == mapping.getTo()
+ && seqDss != map.getFromSeq())
+ {
+ mappedFromLength = MappingUtils
+ .getLength(mapping.getMap().getFromRanges());
+ if (mappedFromLength == map.getFromSeq().getLength())
+ {
+ /*
+ * found a 3:1 mapping to the protein product which covers
+ * the whole dna sequence i.e. is from CDS; finally check the CDS
+ * is mapped from the given dna start sequence
+ */
+ SequenceI cdsSeq = map.getFromSeq();
+ // todo this test is weak if seqMappings contains multiple mappings;
+ // we get away with it if transcript:cds relationship is 1:1
+ List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
+ .findMappingsForSequence(cdsSeq, seqMappings);
+ if (!dnaToCdsMaps.isEmpty())
+ {
+ return cdsSeq;
+ }
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Helper method that makes a CDS sequence as defined by the mappings from the
+ * given sequence i.e. extracts the 'mapped from' ranges (which may be on
+ * forward or reverse strand).
+ *
+ * @param seq
+ * @param mapping
+ * @param dataset
+ * - existing dataset. We check for sequences that look like the CDS
+ * we are about to construct, if one exists already, then we will
+ * just return that one.
+ * @return CDS sequence (as a dataset sequence)
+ */
+ static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
+ AlignmentI dataset)
+ {
+ /*
+ * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
+ * if set (e.g. EMBL protein_id), else sequence name appended
+ */
+ String mapFromId = mapping.getMappedFromId();
+ final String seqId = "CDS|"
+ + (mapFromId != null ? mapFromId : seq.getName());
+
+ SequenceI newSeq = null;
+
+ final MapList maplist = mapping.getMap();
+ if (maplist.isContiguous() && maplist.isFromForwardStrand())
+ {
+ /*
+ * just a subsequence, keep same dataset sequence
+ */
+ int start = maplist.getFromLowest();
+ int end = maplist.getFromHighest();
+ newSeq = seq.getSubSequence(start - 1, end);
+ newSeq.setName(seqId);
+ }
+ else
+ {
+ /*
+ * construct by splicing mapped from ranges
+ */
+ char[] seqChars = seq.getSequence();
+ List<int[]> fromRanges = maplist.getFromRanges();
+ int cdsWidth = MappingUtils.getLength(fromRanges);
+ char[] newSeqChars = new char[cdsWidth];
+
+ int newPos = 0;
+ for (int[] range : fromRanges)
+ {
+ if (range[0] <= range[1])
+ {
+ // forward strand mapping - just copy the range
+ int length = range[1] - range[0] + 1;
+ System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
+ length);
+ newPos += length;
+ }
+ else
+ {
+ // reverse strand mapping - copy and complement one by one
+ for (int i = range[0]; i >= range[1]; i--)
+ {
+ newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
+ }
+ }
+ }
+
+ newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
+ }
+
+ if (dataset != null)
+ {
+ SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
+ if (matches != null)
+ {
+ boolean matched = false;
+ for (SequenceI mtch : matches)
+ {
+ if (mtch.getStart() != newSeq.getStart())
+ {
+ continue;
+ }
+ if (mtch.getEnd() != newSeq.getEnd())
+ {
+ continue;
+ }
+ if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
+ {
+ continue;
+ }
+ if (!matched)
+ {
+ matched = true;
+ newSeq = mtch;
+ }
+ else
+ {
+ System.err.println(
+ "JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
+ + mtch.toString());
+ }
+ }
+ }
+ }
+ // newSeq.setDescription(mapFromId);
+
+ return newSeq;
+ }
+
+ /**
+ * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
+ * the given mapping.
+ *
+ * @param cdsSeq
+ * @param contig
+ * @param proteinProduct
+ * @param mapping
+ * @return list of DBRefEntrys added
+ */
+ protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
+ SequenceI contig, SequenceI proteinProduct, Mapping mapping)
+ {
+
+ // gather direct refs from contig congruent with mapping
+ List<DBRefEntry> direct = new ArrayList<>();
+ HashSet<String> directSources = new HashSet<>();
+
+ if (contig.getDBRefs() != null)
+ {
+ for (DBRefEntry dbr : contig.getDBRefs())
+ {
+ if (dbr.hasMap() && dbr.getMap().getMap().isTripletMap())
+ {
+ MapList map = dbr.getMap().getMap();
+ // check if map is the CDS mapping
+ if (mapping.getMap().equals(map))
+ {
+ direct.add(dbr);
+ directSources.add(dbr.getSource());
+ }
+ }
+ }
+ }
+ DBRefEntry[] onSource = DBRefUtils.selectRefs(
+ proteinProduct.getDBRefs(),
+ directSources.toArray(new String[0]));
+ List<DBRefEntry> propagated = new ArrayList<>();
+
+ // and generate appropriate mappings
+ for (DBRefEntry cdsref : direct)
+ {
+ // clone maplist and mapping
+ MapList cdsposmap = new MapList(
+ Arrays.asList(new int[][]
+ { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
+ cdsref.getMap().getMap().getToRanges(), 3, 1);
+ Mapping cdsmap = new Mapping(cdsref.getMap().getTo(),
+ cdsref.getMap().getMap());
+
+ // create dbref
+ DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
+ cdsref.getVersion(), cdsref.getAccessionId(),
+ new Mapping(cdsmap.getTo(), cdsposmap));
+
+ // and see if we can map to the protein product for this mapping.
+ // onSource is the filtered set of accessions on protein that we are
+ // tranferring, so we assume accession is the same.
+ if (cdsmap.getTo() == null && onSource != null)
+ {
+ List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
+ cdsref.getAccessionId());
+ if (sourceRefs != null)
+ {
+ for (DBRefEntry srcref : sourceRefs)
+ {
+ if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
+ {
+ // we have found a complementary dbref on the protein product, so
+ // update mapping's getTo
+ newref.getMap().setTo(proteinProduct);
+ }
+ }
+ }
+ }
+ cdsSeq.addDBRef(newref);
+ propagated.add(newref);
+ }
+ return propagated;
+ }
+
+ /**
+ * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
+ * feature start/end ranges, optionally omitting specified feature types.
+ * Returns the number of features copied.
+ *
+ * @param fromSeq
+ * @param toSeq
+ * @param mapping
+ * the mapping from 'fromSeq' to 'toSeq'
+ * @param select
+ * if not null, only features of this type are copied (including
+ * subtypes in the Sequence Ontology)
+ * @param omitting
+ */
+ protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
+ MapList mapping, String select, String... omitting)
+ {
+ SequenceI copyTo = toSeq;
+ while (copyTo.getDatasetSequence() != null)
+ {
+ copyTo = copyTo.getDatasetSequence();
+ }
+ if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
+ {
+ return 0; // shared dataset sequence
+ }
+
+ /*
+ * get features, optionally restricted by an ontology term
+ */
+ List<SequenceFeature> sfs = select == null ? fromSeq.getFeatures()
+ .getPositionalFeatures() : fromSeq.getFeatures()
+ .getFeaturesByOntology(select);
+
+ int count = 0;
+ for (SequenceFeature sf : sfs)
+ {
+ String type = sf.getType();
+ boolean omit = false;
+ for (String toOmit : omitting)
+ {
+ if (type.equals(toOmit))
+ {
+ omit = true;
+ }
+ }
+ if (omit)
+ {
+ continue;
+ }
+
+ /*
+ * locate the mapped range - null if either start or end is
+ * not mapped (no partial overlaps are calculated)
+ */
+ int start = sf.getBegin();
+ int end = sf.getEnd();
+ int[] mappedTo = mapping.locateInTo(start, end);
+ /*
+ * if whole exon range doesn't map, try interpreting it
+ * as 5' or 3' exon overlapping the CDS range
+ */
+ if (mappedTo == null)
+ {
+ mappedTo = mapping.locateInTo(end, end);