From: Jim Procter Date: Mon, 29 Aug 2016 21:20:23 +0000 (+0100) Subject: JAL-2154 don’t synthesise multiple CDS| sequences when one is already available... X-Git-Tag: Release_2_10_0~47^2~4^2~32 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=4bf04707b9bb755a7cdaa829eeec02ab87b4a228;p=jalview.git JAL-2154 don’t synthesise multiple CDS| sequences when one is already available on dataset (fixes crossRefs2XML failure - CDS|ENST00000288602 found duplicated "Pass (0,0,0): before start of pass3: ENSEMBL ENSG00000157764 -> UNIPROT{0}:”) --- diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java index e0ec22b..b0a2269 100644 --- a/src/jalview/analysis/AlignmentUtils.java +++ b/src/jalview/analysis/AlignmentUtils.java @@ -1680,15 +1680,20 @@ public class AlignmentUtils * didn't find mapped CDS sequence - construct it and add * its dataset sequence to the dataset */ - cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping); + cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping, + dataset).deriveSequence(); // cdsSeq has a name constructed as CDS| // will be either the accession for the coding sequence, // marked in the /via/ dbref to the protein product accession // or it will be the original nucleotide accession. - SequenceI cdsSeqDss = cdsSeq.createDatasetSequence(); + SequenceI cdsSeqDss = cdsSeq.getDatasetSequence(); + cdsSeqs.add(cdsSeq); + if (!dataset.getSequences().contains(cdsSeqDss)) { + // check if this sequence is a newly created one + // so needs adding to the dataset dataset.addSequence(cdsSeqDss); } @@ -1870,9 +1875,14 @@ public class AlignmentUtils * * @param seq * @param mapping + * @param dataset + * - existing dataset. We check for sequences that look like the CDS + * we are about to construct, if one exists already, then we will + * just return that one. * @return CDS sequence (as a dataset sequence) */ - static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping) + static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping, + AlignmentI dataset) { char[] seqChars = seq.getSequence(); List fromRanges = mapping.getMap().getFromRanges(); @@ -1907,6 +1917,40 @@ public class AlignmentUtils String mapFromId = mapping.getMappedFromId(); String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName()); SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos); + if (dataset != null) + { + SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName()); + if (matches != null) + { + boolean matched = false; + for (SequenceI mtch : matches) + { + if (mtch.getStart() != newSeq.getStart()) + { + continue; + } + if (mtch.getEnd() != newSeq.getEnd()) + { + continue; + } + if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence())) + { + continue; + } + if (!matched) + { + matched = true; + newSeq = mtch; + } + else + { + System.err + .println("JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):" + + mtch.toString()); + } + } + } + } // newSeq.setDescription(mapFromId); return newSeq;