+ /**
+ * Returns a mapping from dna to protein by inspecting sequence features of
+ * type "CDS" on the dna. A mapping is constructed if the total CDS feature
+ * length is 3 times the peptide length (optionally after dropping a trailing
+ * stop codon). This method does not check whether the CDS nucleotide sequence
+ * translates to the peptide sequence.
+ *
+ * @param dnaSeq
+ * @param proteinSeq
+ * @return
+ */
+ public static MapList mapCdsToProtein(SequenceI dnaSeq,
+ SequenceI proteinSeq)
+ {
+ List<int[]> ranges = findCdsPositions(dnaSeq);
+ int mappedDnaLength = MappingUtils.getLength(ranges);
+
+ /*
+ * if not a whole number of codons, truncate mapping
+ */
+ int codonRemainder = mappedDnaLength % CODON_LENGTH;
+ if (codonRemainder > 0)
+ {
+ mappedDnaLength -= codonRemainder;
+ MappingUtils.removeEndPositions(codonRemainder, ranges);
+ }
+
+ int proteinLength = proteinSeq.getLength();
+ int proteinStart = proteinSeq.getStart();
+ int proteinEnd = proteinSeq.getEnd();
+
+ /*
+ * incomplete start codon may mean X at start of peptide
+ * we ignore both for mapping purposes
+ */
+ if (proteinSeq.getCharAt(0) == 'X')
+ {
+ // todo JAL-2022 support startPhase > 0
+ proteinStart++;
+ proteinLength--;
+ }
+ List<int[]> proteinRange = new ArrayList<>();
+
+ /*
+ * dna length should map to protein (or protein plus stop codon)
+ */
+ int codesForResidues = mappedDnaLength / CODON_LENGTH;
+ if (codesForResidues == (proteinLength + 1))
+ {
+ // assuming extra codon is for STOP and not in peptide
+ // todo: check trailing codon is indeed a STOP codon
+ codesForResidues--;
+ mappedDnaLength -= CODON_LENGTH;
+ MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
+ }
+
+ if (codesForResidues == proteinLength)
+ {
+ proteinRange.add(new int[] { proteinStart, proteinEnd });
+ return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
+ }
+ return null;
+ }
+
+ /**
+ * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
+ * [start, end] positions of sequence features of type "CDS" (or a sub-type of
+ * CDS in the Sequence Ontology). The ranges are sorted into ascending start
+ * position order, so this method is only valid for linear CDS in the same
+ * sense as the protein product.
+ *
+ * @param dnaSeq
+ * @return
+ */
+ protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
+ {
+ List<int[]> result = new ArrayList<>();
+
+ List<SequenceFeature> sfs = dnaSeq.getFeatures()
+ .getFeaturesByOntology(SequenceOntologyI.CDS);
+ if (sfs.isEmpty())
+ {
+ return result;
+ }
+ SequenceFeatures.sortFeatures(sfs, true);
+
+ for (SequenceFeature sf : sfs)
+ {
+ int phase = 0;
+ try
+ {
+ String s = sf.getPhase();
+ if (s != null)
+ {
+ phase = Integer.parseInt(s);
+ }
+ } catch (NumberFormatException e)
+ {
+ // leave as zero
+ }
+ /*
+ * phase > 0 on first codon means 5' incomplete - skip to the start
+ * of the next codon; example ENST00000496384
+ */
+ int begin = sf.getBegin();
+ int end = sf.getEnd();
+ if (result.isEmpty() && phase > 0)
+ {
+ begin += phase;
+ if (begin > end)
+ {
+ // shouldn't happen!
+ System.err
+ .println("Error: start phase extends beyond start CDS in "
+ + dnaSeq.getName());
+ }
+ }
+ result.add(new int[] { begin, end });
+ }
+
+ /*
+ * Finally sort ranges by start position. This avoids a dependency on
+ * keeping features in order on the sequence (if they are in order anyway,
+ * the sort will have almost no work to do). The implicit assumption is CDS
+ * ranges are assembled in order. Other cases should not use this method,
+ * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
+ */
+ Collections.sort(result, IntRangeComparator.ASCENDING);
+ return result;
+ }
+
+ /**
+ * Makes an alignment with a copy of the given sequences, adding in any
+ * non-redundant sequences which are mapped to by the cross-referenced
+ * sequences.
+ *
+ * @param seqs
+ * @param xrefs
+ * @param dataset
+ * the alignment dataset shared by the new copy
+ * @return
+ */
+ public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
+ SequenceI[] xrefs, AlignmentI dataset)
+ {
+ AlignmentI copy = new Alignment(new Alignment(seqs));
+ copy.setDataset(dataset);
+ boolean isProtein = !copy.isNucleotide();
+ SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
+ if (xrefs != null)
+ {
+ // BH 2019.01.25 recoded to remove iterators
+
+ for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
+ {
+ SequenceI xref = xrefs[ix];
+ List<DBRefEntry> dbrefs = xref.getDBRefs();
+ if (dbrefs != null)