From 4bf04707b9bb755a7cdaa829eeec02ab87b4a228 Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Mon, 29 Aug 2016 22:20:23 +0100 Subject: [PATCH] =?utf8?q?JAL-2154=20don=E2=80=99t=20synthesise=20multiple=20?= =?utf8?q?CDS|=20sequences=20when=20one=20is=20already=20available=20on?= =?utf8?q?=20dataset=20(fixes=20crossRefs2XML=20failure=20-=20CDS|ENST000002?= =?utf8?q?88602=20found=20duplicated=20"Pass=20(0,0,0):=20before=20start=20o?= =?utf8?q?f=20pass3:=20ENSEMBL=20ENSG00000157764=20->=20UNIPROT{0}:=E2=80=9D?= =?utf8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- src/jalview/analysis/AlignmentUtils.java | 50 ++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index e0ec22b..b0a2269 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1680,15 +1680,20 @@ public class AlignmentUtils * didn't find mapped CDS sequence - construct it and add * its dataset sequence to the dataset */ - cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping); + cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping, + dataset).deriveSequence(); // cdsSeq has a name constructed as CDS| // will be either the accession for the coding sequence, // marked in the /via/ dbref to the protein product accession // or it will be the original nucleotide accession. - SequenceI cdsSeqDss = cdsSeq.createDatasetSequence(); + SequenceI cdsSeqDss = cdsSeq.getDatasetSequence(); + cdsSeqs.add(cdsSeq); + if (!dataset.getSequences().contains(cdsSeqDss)) { + // check if this sequence is a newly created one + // so needs adding to the dataset dataset.addSequence(cdsSeqDss); } @@ -1870,9 +1875,14 @@ public class AlignmentUtils * * @param seq * @param mapping + * @param dataset + * - existing dataset. We check for sequences that look like the CDS + * we are about to construct, if one exists already, then we will + * just return that one. * @return CDS sequence (as a dataset sequence) */ - static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping) + static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping, + AlignmentI dataset) { char[] seqChars = seq.getSequence(); List fromRanges = mapping.getMap().getFromRanges(); @@ -1907,6 +1917,40 @@ public class AlignmentUtils String mapFromId = mapping.getMappedFromId(); String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName()); SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos); + if (dataset != null) + { + SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName()); + if (matches != null) + { + boolean matched = false; + for (SequenceI mtch : matches) + { + if (mtch.getStart() != newSeq.getStart()) + { + continue; + } + if (mtch.getEnd() != newSeq.getEnd()) + { + continue; + } + if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence())) + { + continue; + } + if (!matched) + { + matched = true; + newSeq = mtch; + } + else + { + System.err + .println("JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):" + + mtch.toString()); + } + } + } + } // newSeq.setDescription(mapFromId); return newSeq; -- 1.7.10.2