From: gmungoc Date: Mon, 21 Mar 2016 14:42:51 +0000 (+0000) Subject: JAL-2029 many-to-many EnsemblCDS-to-Uniprot mappings X-Git-Tag: Release_2_10_0~290^2~7 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=4ad19b786f19aeadaf7a841e43ff8e490a39589d;p=jalview.git JAL-2029 many-to-many EnsemblCDS-to-Uniprot mappings --- diff --git a/src/jalview/analysis/CrossRef.java b/src/jalview/analysis/CrossRef.java index 7d09a3b..d48c14a 100644 --- a/src/jalview/analysis/CrossRef.java +++ b/src/jalview/analysis/CrossRef.java @@ -361,6 +361,9 @@ public class CrossRef { updateDbrefMappings(dna, seq, xrfs, retrieved, cf); + SequenceIdMatcher matcher = new SequenceIdMatcher( + dataset.getSequences()); + matcher.addAll(addedPeers); List copiedFeatures = new ArrayList(); CrossRef me = new CrossRef(); for (int rs = 0; rs < retrieved.length; rs++) @@ -378,8 +381,16 @@ public class CrossRef { if (map.getTo() != null && map.getMap() != null) { - // should search the local dataset to find any existing - // candidates for To ! + SequenceI matched = matcher + .findIdMatch(map.getTo()); + if (matched != null) + { + map.setTo(matched); + } + else + { + matcher.add(map.getTo()); + } try { // compare ms with dss and replace with dss in mapping @@ -433,7 +444,10 @@ public class CrossRef } else { - addedPeers.add(map.getTo()); + if (!addedPeers.contains(map.getTo())) + { + addedPeers.add(map.getTo()); + } cf.addMap(retrieved[rs].getDatasetSequence(), map.getTo(), map.getMap()); } diff --git a/src/jalview/analysis/SequenceIdMatcher.java b/src/jalview/analysis/SequenceIdMatcher.java index b89287c..70defb0 100755 --- a/src/jalview/analysis/SequenceIdMatcher.java +++ b/src/jalview/analysis/SequenceIdMatcher.java @@ -46,7 +46,7 @@ public class SequenceIdMatcher } /** - * add more sequences to this matcher - also used by the constructor + * Adds sequences to this matcher * * @param seqs */ @@ -54,26 +54,36 @@ public class SequenceIdMatcher { for (SequenceI seq : seqs) { - // TODO: deal with ID collisions - SequenceI should be appended to list - // associated with this key. - names.put(new SeqIdName(seq.getDisplayId(true)), seq); - SequenceI dbseq = seq; - while (dbseq.getDatasetSequence() != null) - { - dbseq = dbseq.getDatasetSequence(); - } - // add in any interesting identifiers - if (dbseq.getDBRefs() != null) + add(seq); + } + } + + /** + * Adds one sequence to this matcher + * + * @param seq + */ + public void add(SequenceI seq) + { + // TODO: deal with ID collisions - SequenceI should be appended to list + // associated with this key. + names.put(new SeqIdName(seq.getDisplayId(true)), seq); + SequenceI dbseq = seq; + while (dbseq.getDatasetSequence() != null) + { + dbseq = dbseq.getDatasetSequence(); + } + // add in any interesting identifiers + if (dbseq.getDBRefs() != null) + { + DBRefEntry dbr[] = dbseq.getDBRefs(); + SeqIdName sid = null; + for (int r = 0; r < dbr.length; r++) { - DBRefEntry dbr[] = dbseq.getDBRefs(); - SeqIdName sid = null; - for (int r = 0; r < dbr.length; r++) + sid = new SeqIdName(dbr[r].getAccessionId()); + if (!names.containsKey(sid)) { - sid = new SeqIdName(dbr[r].getAccessionId()); - if (!names.containsKey(sid)) - { - names.put(sid, seq); - } + names.put(sid, seq); } } } diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 87e2789..7da6d6c 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -20,6 +20,7 @@ */ package jalview.datamodel.xdb.embl; +import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; @@ -27,21 +28,32 @@ import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.util.DBRefUtils; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.StringUtils; import java.util.Hashtable; +import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.util.Vector; +import java.util.regex.Pattern; /** * Data model for one entry returned from an EMBL query, as marshalled by a * Castor binding file * - * For example: http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/x53828/emblxml + * For example: + * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 + * &format=emblxml * * @see embl_mapping.xml */ public class EmblEntry { + private static final Pattern SPACE_PATTERN = Pattern.compile(" "); + String accession; String version; @@ -251,207 +263,48 @@ public class EmblEntry this.version = version; } - /* - * EMBL Feature support is limited. The text below is included for the benefit - * of any developer working on improving EMBL feature import in Jalview. - * Extract from EMBL feature specification see - * http://www.embl-ebi.ac.uk/embl/Documentation - * /FT_definitions/feature_table.html 3.5 Location 3.5.1 Purpose - * - * The location indicates the region of the presented sequence which - * corresponds to a feature. - * - * 3.5.2 Format and conventions The location contains at least one sequence - * location descriptor and may contain one or more operators with one or more - * sequence location descriptors. Base numbers refer to the numbering in the - * entry. This numbering designates the first base (5' end) of the presented - * sequence as base 1. Base locations beyond the range of the presented - * sequence may not be used in location descriptors, the only exception being - * location in a remote entry (see 3.5.2.1, e). - * - * Location operators and descriptors are discussed in more detail below. - * - * 3.5.2.1 Location descriptors - * - * The location descriptor can be one of the following: (a) a single base - * number (b) a site between two indicated adjoining bases (c) a single base - * chosen from within a specified range of bases (not allowed for new entries) - * (d) the base numbers delimiting a sequence span (e) a remote entry - * identifier followed by a local location descriptor (i.e., a-d) - * - * A site between two adjoining nucleotides, such as endonucleolytic cleavage - * site, is indicated by listing the two points separated by a carat (^). The - * permitted formats for this descriptor are n^n+1 (for example 55^56), or, - * for circular molecules, n^1, where "n" is the full length of the molecule, - * ie 1000^1 for circular molecule with length 1000. - * - * A single base chosen from a range of bases is indicated by the first base - * number and the last base number of the range separated by a single period - * (e.g., '12.21' indicates a single base taken from between the indicated - * points). From October 2006 the usage of this descriptor is restricted : it - * is illegal to use "a single base from a range" (c) either on its own or in - * combination with the "sequence span" (d) descriptor for newly created - * entries. The existing entries where such descriptors exist are going to be - * retrofitted. - * - * Sequence spans are indicated by the starting base number and the ending - * base number separated by two periods (e.g., '34..456'). The '<' and '>' - * symbols may be used with the starting and ending base numbers to indicate - * that an end point is beyond the specified base number. The starting and - * ending base positions can be represented as distinct base numbers - * ('34..456') or a site between two indicated adjoining bases. - * - * A location in a remote entry (not the entry to which the feature table - * belongs) can be specified by giving the accession-number and sequence - * version of the remote entry, followed by a colon ":", followed by a - * location descriptor which applies to that entry's sequence (i.e. - * J12345.1:1..15, see also examples below) - * - * 3.5.2.2 Operators - * - * The location operator is a prefix that specifies what must be done to the - * indicated sequence to find or construct the location corresponding to the - * feature. A list of operators is given below with their definitions and most - * common format. - * - * complement(location) Find the complement of the presented sequence in the - * span specified by " location" (i.e., read the complement of the presented - * strand in its 5'-to-3' direction) - * - * join(location,location, ... location) The indicated elements should be - * joined (placed end-to-end) to form one contiguous sequence - * - * order(location,location, ... location) The elements can be found in the - * specified order (5' to 3' direction), but nothing is implied about the - * reasonableness about joining them - * - * Note : location operator "complement" can be used in combination with - * either " join" or "order" within the same location; combinations of "join" - * and "order" within the same location (nested operators) are illegal. - * - * - * - * 3.5.3 Location examples - * - * The following is a list of common location descriptors with their meanings: - * - * Location Description - * - * 467 Points to a single base in the presented sequence - * - * 340..565 Points to a continuous range of bases bounded by and including the - * starting and ending bases - * - * <345..500 Indicates that the exact lower boundary point of a feature is - * unknown. The location begins at some base previous to the first base - * specified (which need not be contained in the presented sequence) and - * continues to and includes the ending base - * - * <1..888 The feature starts before the first sequenced base and continues to - * and includes base 888 - * - * 1..>888 The feature starts at the first sequenced base and continues beyond - * base 888 - * - * 102.110 Indicates that the exact location is unknown but that it is one of - * the bases between bases 102 and 110, inclusive - * - * 123^124 Points to a site between bases 123 and 124 - * - * join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to - * form one contiguous sequence - * - * - * complement(34..126) Start at the base complementary to 126 and finish at - * the base complementary to base 34 (the feature is on the strand - * complementary to the presented strand) - * - * - * complement(join(2691..4571,4918..5163)) Joins regions 2691 to 4571 and 4918 - * to 5163, then complements the joined segments (the feature is on the strand - * complementary to the presented strand) - * - * join(complement(4918..5163),complement(2691..4571)) Complements regions - * 4918 to 5163 and 2691 to 4571, then joins the complemented segments (the - * feature is on the strand complementary to the presented strand) - * - * J00194.1:100..202 Points to bases 100 to 202, inclusive, in the entry (in - * this database) with primary accession number 'J00194' - * - * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry - * with the region 100..202 of remote entry J00194 - */ /** * Recover annotated sequences from EMBL file * - * @param noNa - * don't return nucleic acid sequences * @param sourceDb - * TODO - * @param noProtein - * don't return any translated protein sequences marked in features - * @return dataset sequences with DBRefs and features - DNA always comes first + * @param peptides + * a list of protein products found so far (to add to) + * @return dna dataset sequence with DBRefs and features */ - public jalview.datamodel.SequenceI[] getSequences(boolean noNa, - boolean noPeptide, String sourceDb) - { // TODO: ensure emblEntry.getSequences behaves correctly for returning all - // cases of noNa and noPeptide - Vector seqs = new Vector(); - Sequence dna = null; - if (!noNa) + public SequenceI getSequence(String sourceDb, List peptides) + { + SequenceI dna = new Sequence(sourceDb + "|" + accession, + sequence.getSequence()); + dna.setDescription(desc); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); + dna.addDBRef(retrievedref); + // add map to indicate the sequence is a valid coordinate frame for the + // dbref + retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, + new int[] { 1, dna.getLength() }, 1, 1)); + // TODO: transform EMBL Database refs to canonical form + if (dbRefs != null) { - // In theory we still need to create this if noNa is set to avoid a null - // pointer exception - dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); - dna.setDescription(desc); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); - dna.addDBRef(retrievedref); - // add map to indicate the sequence is a valid coordinate frame for the - // dbref - retrievedref.setMap(new Mapping(null, - new int[] { 1, dna.getLength() }, new int[] { 1, - dna.getLength() }, 1, 1)); - // TODO: transform EMBL Database refs to canonical form - if (dbRefs != null) + for (DBRefEntry dbref : dbRefs) { - for (DBRefEntry dbref : dbRefs) - { - dna.addDBRef(dbref); - } + dna.addDBRef(dbref); } } + try { for (EmblFeature feature : features) { - if (!noNa) + if (feature.dbRefs != null) { - if (feature.dbRefs != null) + for (DBRefEntry dbref : feature.dbRefs) { - for (DBRefEntry dbref : feature.dbRefs) - { - dna.addDBRef(dbref); - } + dna.addDBRef(dbref); } } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - parseCodingFeature(feature, sourceDb, seqs, dna, noPeptide); - } - else - { - // General feature type. - // TODO this is just duplicated code ?? - if (!noNa) - { - if (feature.dbRefs != null) - { - for (DBRefEntry dbref : feature.dbRefs) - { - dna.addDBRef(dbref); - } - } - } + parseCodingFeature(feature, sourceDb, dna, peptides); } } } catch (Exception e) @@ -463,65 +316,46 @@ public class EmblEntry System.err.println("Resulted in exception: " + e.getMessage()); e.printStackTrace(System.err); } - if (!noNa && dna != null) - { - seqs.add(dna); - } - SequenceI[] sqs = new SequenceI[seqs.size()]; - for (int i = 0, j = seqs.size(); i < j; i++) - { - sqs[i] = seqs.elementAt(i); - seqs.set(i, null); - } - return sqs; + + return dna; } /** - * attempt to extract coding region and product from a feature and properly - * decorate it with annotations. + * Extracts coding region and product from a CDS feature and properly decorate + * it with annotations. * * @param feature * coding feature * @param sourceDb * source database for the EMBLXML - * @param seqs - * place where sequences go * @param dna * parent dna sequence for this record - * @param noPeptide - * flag for generation of Peptide sequence objects + * @param peptides + * list of protein product sequences for Embl entry */ - private void parseCodingFeature(EmblFeature feature, String sourceDb, - Vector seqs, Sequence dna, boolean noPeptide) + void parseCodingFeature(EmblFeature feature, String sourceDb, + SequenceI dna, List peptides) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); - // extract coding region(s) - jalview.datamodel.Mapping map = null; - int[] exon = null; - if (feature.locations != null) - { - for (EmblFeatureLocations loc : feature.locations) - { - int[] se = loc.getElementRanges(accession); - if (exon == null) - { - exon = se; - } - else - { - int[] t = new int[exon.length + se.length]; - System.arraycopy(exon, 0, t, 0, exon.length); - System.arraycopy(se, 0, t, exon.length, se.length); - exon = t; - } - } - } + + int[] exon = getCdsRanges(feature); + String prseq = null; - String prname = new String(); + String prname = ""; String prid = null; - Hashtable vals = new Hashtable(); - int prstart = 1; - // get qualifiers + Map vals = new Hashtable(); + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); + + /* + * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS + * (phase is required for CDS features in GFF3 format) + */ + int codonStart = 1; + + /* + * parse qualifiers, saving protein translation, protein id, + * codon start position, product (name), and 'other values' + */ if (feature.getQualifiers() != null) { for (Qualifier q : feature.getQualifiers()) @@ -529,16 +363,8 @@ public class EmblEntry String qname = q.getName(); if (qname.equals("translation")) { - StringBuilder prsq = new StringBuilder(q.getValues()[0]); - int p = prsq.indexOf(" "); - while (p > -1) - { - prsq.deleteCharAt(p); - p = prsq.indexOf(" ", p); - } - prseq = prsq.toString(); - prsq = null; - + // remove all spaces (precompiled String.replaceAll(" ", "")) + prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); } else if (qname.equals("protein_id")) { @@ -546,46 +372,57 @@ public class EmblEntry } else if (qname.equals("codon_start")) { - prstart = Integer.parseInt(q.getValues()[0]); + try + { + codonStart = Integer.parseInt(q.getValues()[0]); + } catch (NumberFormatException e) + { + System.err.println("Invalid codon_start in XML for " + + accession + ": " + e.getMessage()); + } } else if (qname.equals("product")) { + // sometimes name is returned e.g. for V00488 prname = q.getValues()[0]; } else { // throw anything else into the additional properties hash - String[] s = q.getValues(); - StringBuilder sb = new StringBuilder(); - if (s != null) + String[] qvals = q.getValues(); + if (qvals != null) { - for (int i = 0; i < s.length; i++) - { - sb.append(s[i]); - sb.append("\n"); - } + String commaSeparated = StringUtils.arrayToSeparatorList(qvals, + ","); + vals.put(qname, commaSeparated); } - vals.put(qname, sb.toString()); } } } - Sequence product = null; + + // SequenceI product = null; DBRefEntry protEMBLCDS = null; - exon = adjustForPrStart(prstart, exon); + exon = MappingUtils.removeStartPositions(codonStart - 1, exon); boolean noProteinDbref = true; + SequenceI product = null; + Mapping map = null; if (prseq != null && prname != null && prid != null) { - // extract proteins. - product = new Sequence(prid, prseq, 1, prseq.length()); - product.setDescription(((prname.length() == 0) ? "Protein Product from " - + sourceDb - : prname)); - if (!noPeptide) + /* + * look for product in peptides list, if not found, add it + */ + product = matcher.findIdMatch(prid); + if (product == null) { - // Protein is also added to vector of sequences returned - seqs.add(product); + product = new Sequence(prid, prseq, 1, prseq.length()); + product.setDescription(((prname.length() == 0) ? "Protein Product from " + + sourceDb + : prname)); + peptides.add(product); + matcher.add(product); } + // we have everything - create the mapping and perhaps the protein // sequence if (exon == null || exon.length == 0) @@ -593,24 +430,24 @@ public class EmblEntry System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (prseq.length() * 3 == (1 - prstart + dna.getSequence().length)) + if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length)) { System.err .println("Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are // marked. - exon = new int[] { dna.getStart() + (prstart - 1), dna.getEnd() }; - map = new jalview.datamodel.Mapping(product, exon, new int[] { 1, - prseq.length() }, 3, 1); + exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; + map = new Mapping(product, exon, new int[] { 1, prseq.length() }, + 3, 1); } - if ((prseq.length() + 1) * 3 == (1 - prstart + dna.getSequence().length)) + if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length)) { System.err .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); - exon = new int[] { dna.getStart() + (prstart - 1), + exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; - map = new jalview.datamodel.Mapping(product, exon, new int[] { 1, - prseq.length() }, 3, 1); + map = new Mapping(product, exon, new int[] { 1, prseq.length() }, + 3, 1); } } else @@ -628,11 +465,13 @@ public class EmblEntry } else { - // final product length trunctation check - - map = new jalview.datamodel.Mapping(product, - adjustForProteinLength(prseq.length(), exon), new int[] { - 1, prseq.length() }, 3, 1); + // final product length truncation check + // TODO should from range include stop codon even if not in protein + // in order to include stop codon in CDS sequence (as done for + // Ensembl)? + int[] cdsRanges = adjustForProteinLength(prseq.length(), + exon); + map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1); // reconstruct the EMBLCDS entry // TODO: this is only necessary when there codon annotation is // complete (I think JBPNote) @@ -640,12 +479,9 @@ public class EmblEntry pcdnaref.setAccessionId(prid); pcdnaref.setSource(DBRefSource.EMBLCDS); pcdnaref.setVersion(getVersion()); // same as parent EMBL version. - jalview.util.MapList mp = new jalview.util.MapList(new int[] { 1, - prseq.length() }, new int[] { 1 + (prstart - 1), - (prstart - 1) + 3 * prseq.length() }, 1, 3); - // { 1 + (prstart - 1) * 3, - // 1 + (prstart - 1) * 3 + prseq.length() * 3 - 1 }, new int[] - // { 1prstart, prstart + prseq.length() - 1 }, 3, 1); + MapList mp = new MapList(new int[] { 1, prseq.length() }, + new int[] { 1 + (codonStart - 1), + (codonStart - 1) + 3 * prseq.length() }, 1, 3); pcdnaref.setMap(new Mapping(mp)); if (product != null) { @@ -653,55 +489,60 @@ public class EmblEntry protEMBLCDS = new DBRefEntry(pcdnaref); protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); product.addDBRef(protEMBLCDS); - } - } } // add cds feature to dna seq - this may include the stop codon for (int xint = 0; exon != null && xint < exon.length; xint += 2) { - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(exon[xint]); - sf.setEnd(exon[xint + 1]); - sf.setType(feature.getName()); + SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals, + codonStart); + sf.setType(feature.getName()); // "CDS" sf.setFeatureGroup(sourceDb); - sf.setDescription("Exon " + (1 + xint / 2) + " for protein '" - + prname + "' EMBLCDS:" + prid); - sf.setValue(FeatureProperties.EXONPOS, new Integer(1 + xint)); - sf.setValue(FeatureProperties.EXONPRODUCT, prname); - if (vals != null) - { - for (Entry val : vals.entrySet()) - { - sf.setValue(val.getKey(), val.getValue()); - } - } dna.addSequenceFeature(sf); } } // add dbRefs to sequence if (feature.dbRefs != null) { + boolean productMapped = false; for (DBRefEntry ref : feature.dbRefs) { - ref.setSource(jalview.util.DBRefUtils.getCanonicalName(ref - .getSource())); + ref.setSource(DBRefUtils.getCanonicalName(ref.getSource())); // Hard code the kind of protein product accessions that EMBL cite - if (ref.getSource().equals(jalview.datamodel.DBRefSource.UNIPROT)) + if (ref.getSource().equals(DBRefSource.UNIPROT)) { + String refSeqName = DBRefSource.UNIPROT + "|" + + ref.getAccessionId(); ref.setMap(map); if (map != null && map.getTo() != null) { - map.getTo().addDBRef( - new DBRefEntry(ref.getSource(), ref.getVersion(), ref - .getAccessionId())); // don't copy map over. - if (map.getTo().getName().indexOf(prid) == 0) - { - map.getTo().setName( - jalview.datamodel.DBRefSource.UNIPROT + "|" - + ref.getAccessionId()); - } + // if (!productMapped) + // { + // map.getTo().setName(refSeqName); + // map.getTo().addDBRef( + // new DBRefEntry(ref.getSource(), ref.getVersion(), ref + // .getAccessionId())); // don't copy map over. + // // if (map.getTo().getName().startsWith(prid)) + // productMapped = true; + // } + // else + // { + /* + * an alternate UNIPROT product for CDS - same mapping + * but to a sequence with a different name + */ + SequenceI newSeq = matcher.findIdMatch(refSeqName); + if (newSeq == null) + { + newSeq = new Sequence(refSeqName, map.getTo() + .getSequenceAsString()); + matcher.add(newSeq); + peptides.add(newSeq); + } + Mapping newMap = new Mapping(newSeq, map.getMap()); + ref.setMap(newMap); + // } } noProteinDbref = false; } @@ -756,39 +597,86 @@ public class EmblEntry } } - private int[] adjustForPrStart(int prstart, int[] exon) + /** + * Helper method to construct a SequenceFeature for one cds range + * + * @param exons + * array of cds [start, end, ...] positions + * @param exonStartIndex + * offset into the exons array + * @param proteinName + * @param proteinAccessionId + * @param vals + * map of 'miscellaneous values' for feature + * @param codonStart + * codon start position for CDS (1/2/3, normally 1) + * @return + */ + protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, + String proteinName, String proteinAccessionId, + Map vals, int codonStart) { - - int origxon[], sxpos = -1; - int sxstart, sxstop; // unnecessary variables used for debugging - // first adjust range for codon start attribute - if (prstart > 1) + int exonNumber = exonStartIndex / 2 + 1; + SequenceFeature sf = new SequenceFeature(); + sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); + sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); + sf.setDescription(String.format( + "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName, + proteinAccessionId)); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-"); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + if (!vals.isEmpty()) { - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length && sxpos == -1; x += 2) + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Entry val : vals.entrySet()) { - cdspos += exon[x + 1] - exon[x] + 1; - if (prstart <= cdspos) + if (!first) { - sxpos = x; - sxstart = exon[x]; - sxstop = exon[x + 1]; - // and adjust start boundary of first exon. - exon[x] = exon[x + 1] - cdspos + prstart; - break; + sb.append(";"); } + sb.append(val.getKey()).append("=").append(val.getValue()); + first = false; + sf.setValue(val.getKey(), val.getValue()); } + sf.setAttributes(sb.toString()); + } + return sf; + } - if (sxpos > 0) - { - int[] nxon = new int[exon.length - sxpos]; - System.arraycopy(exon, sxpos, nxon, 0, exon.length - sxpos); - exon = nxon; - } + /** + * Returns the CDS positions as a list of [start, end, start, end...] + * positions. If on the reverse strand, these will be in descending order. + * + * @param feature + * @return + */ + protected int[] getCdsRanges(EmblFeature feature) + { + if (feature.locations == null) + { + return new int[] {}; } - return exon; + int cdsBoundaryCount = 0; // count of all start/stop locations + int[][] cdsLocations = new int[feature.locations.size()][]; + int locationNumber = 0; + for (EmblFeatureLocations loc : feature.locations) + { + int[] locationRanges = loc.getElementRanges(accession); + cdsLocations[locationNumber++] = locationRanges; + cdsBoundaryCount += locationRanges.length; + } + int[] cdsRanges = new int[cdsBoundaryCount]; + int copyTo = 0; + for (int[] ranges : cdsLocations) + { + System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length); + copyTo += ranges.length; + } + return cdsRanges; + } /** @@ -802,7 +690,6 @@ public class EmblEntry { int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3; - int sxstart, sxstop; // unnecessary variables used for debugging // first adjust range for codon start attribute if (prlength >= 1 && exon != null) { @@ -811,13 +698,11 @@ public class EmblEntry int cdspos = 0; for (int x = 0; x < exon.length && sxpos == -1; x += 2) { - cdspos += exon[x + 1] - exon[x] + 1; + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; if (cdslength <= cdspos) { // advanced beyond last codon. sxpos = x; - sxstart = exon[x]; - sxstop = exon[x + 1]; if (cdslength != cdspos) { System.err diff --git a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java index eb0bee7..9774004 100644 --- a/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java +++ b/src/jalview/datamodel/xdb/embl/EmblFeatureLocations.java @@ -20,13 +20,18 @@ */ package jalview.datamodel.xdb.embl; +import jalview.bin.Cache; +import jalview.util.ArrayUtils; + +import java.util.Arrays; import java.util.Vector; /** - * Data model for a <loctaion> child element of a <feature> read + * Data model for a <location> child element of a <feature> read * from an EMBL query reply * * @see embl_mapping.xml + * @see http://www.insdc.org/files/feature_table.html#3.4.2 */ public class EmblFeatureLocations { @@ -101,21 +106,21 @@ public class EmblFeatureLocations } /** - * Return all location elements concerning given accession as start-end pairs - * TODO: pass back complement and 'less than or more than' range information - * TODO: deal with multiple accessions + * Return all location elements concerning given accession as start-end pairs. + * If the CDS feature is on the forward strand, then start <= end, if on the + * reverse strand then start > end. * * @param accession * the accession string for which locations are requested, or null * for all locations - * @return null or int[] { start1, end1, ... } + * @return int[] { start1, end1, ... } */ - - public int[] getElementRanges(String accession) + int[] getElementRanges(String accession) { int sepos = 0; int[] se = new int[locElements.size() * 2]; - if (locationType.equalsIgnoreCase("single")) // TODO: or "simple" ? + if ("single".equalsIgnoreCase(locationType) + || "join".equalsIgnoreCase(locationType)) { for (EmblFeatureLocElement loce : locElements) { @@ -125,50 +130,61 @@ public class EmblFeatureLocations BasePosition bp[] = loce.getBasePositions(); if (bp.length == 2) { - se[sepos++] = Integer.parseInt(bp[0].getPos()); - se[sepos++] = Integer.parseInt(bp[1].getPos()); + try + { + int start = Integer.parseInt(bp[0].getPos()); + int end = Integer.parseInt(bp[1].getPos()); + se[sepos++] = start; + se[sepos++] = end; + } catch (NumberFormatException e) + { + System.err + .println("format error in EMBL CDS location basePosition: " + + e.getMessage()); + } } - } - } - } - else if (locationType.equalsIgnoreCase("join")) - { - for (EmblFeatureLocElement loce : locElements) - { - if (accession == null || loce.accession != null - && accession.equals(loce.accession)) - { - BasePosition bp[] = loce.getBasePositions(); - if (bp.length == 2) + else { - se[sepos++] = Integer.parseInt(bp[0].getPos()); - se[sepos++] = Integer.parseInt(bp[1].getPos()); + System.err + .println("format error in EMBL CDS location, basePosition count = " + + bp.length); } } } - return se; } else if (locationType != null) { - if (jalview.bin.Cache.log != null) + if (Cache.log != null) { - jalview.bin.Cache.log - .error("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='" + Cache.log + .error("EmblFeatureLocations.getElementRanges cannot deal with locationType=='" + locationType + "'"); } else { System.err - .println("EmbleFeatureLocations.getElementRanges cannot deal with locationType=='" + .println("EmblFeatureLocations.getElementRanges cannot deal with locationType=='" + locationType + "'"); } } - // trim range if necessary. - if (se != null && sepos != se.length) + + if (sepos != se.length) + { + /* + * we failed to parse something - trim off null values + */ + se = Arrays.copyOf(se, sepos); + } + + /* + * If on the complement, reverse the ranges to [end, start, ...end1, start1]. + * For an example of a joined complement, see (tRNA feature) CAGL0B00165r on + * http://www.ebi.ac.uk/ena/data/view/CR380948&display=xml + * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/CR380948/emblxml + */ + if (locationComplement) { - int[] trimmed = new int[sepos]; - System.arraycopy(se, 0, trimmed, 0, sepos); - se = trimmed; + ArrayUtils.reverseIntArray(se); } return se; } diff --git a/src/jalview/util/ArrayUtils.java b/src/jalview/util/ArrayUtils.java new file mode 100644 index 0000000..92085c3 --- /dev/null +++ b/src/jalview/util/ArrayUtils.java @@ -0,0 +1,27 @@ +package jalview.util; + +public class ArrayUtils +{ + /** + * Reverse the given array 'in situ' + * + * @param arr + */ + public static void reverseIntArray(int[] arr) + { + if (arr != null) + { + /* + * swap [k] with [end-k] up to the half way point in the array + * if length is odd, the middle entry is left untouched by the excitement + */ + int last = arr.length - 1; + for (int k = 0; k < arr.length / 2; k++) + { + int temp = arr[k]; + arr[k] = arr[last - k]; + arr[last - k] = temp; + } + } + } +} diff --git a/src/jalview/util/MappingUtils.java b/src/jalview/util/MappingUtils.java index 0780b2a..c2cad1f 100644 --- a/src/jalview/util/MappingUtils.java +++ b/src/jalview/util/MappingUtils.java @@ -821,4 +821,66 @@ public final class MappingUtils } return false; } + + /** + * Removes a specified number of positions from the start of a ranges list. + * For example, could be used to adjust cds ranges to allow for an incomplete + * start codon. Subranges are removed completely, or their start positions + * adjusted, until the required number of positions has been removed from the + * range. Reverse strand ranges are supported. The input array is not + * modified. + * + * @param removeCount + * @param ranges + * an array of [start, end, start, end...] positions + * @return a new array with the first removeCount positions removed + */ + public static int[] removeStartPositions(int removeCount, + final int[] ranges) + { + if (removeCount <= 0) + { + return ranges; + } + + int[] copy = Arrays.copyOf(ranges, ranges.length); + int sxpos = -1; + int cdspos = 0; + for (int x = 0; x < copy.length && sxpos == -1; x += 2) + { + // fixme handle reverse strand + cdspos += Math.abs(copy[x + 1] - copy[x]) + 1; + if (removeCount < cdspos) + { + /* + * we have removed enough, time to finish + */ + sxpos = x; + + /* + * increment start of first exon, or decrement if reverse strand + */ + if (copy[x] <= copy[x + 1]) + { + copy[x] = copy[x + 1] - cdspos + removeCount + 1; + } + else + { + copy[x] = copy[x + 1] + cdspos - removeCount - 1; + } + break; + } + } + + if (sxpos > 0) + { + /* + * we dropped at least one entire sub-range - compact the array + */ + int[] nxon = new int[copy.length - sxpos]; + System.arraycopy(copy, sxpos, nxon, 0, copy.length - sxpos); + return nxon; + } + return copy; + } } diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index 66ebe1b..4041606 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -29,14 +29,15 @@ import jalview.util.MessageManager; import jalview.ws.ebi.EBIFetchClient; import java.io.File; +import java.util.ArrayList; +import java.util.List; public abstract class EmblXmlSource extends EbiFileRetrievedProxy { - - /** - * Last properly parsed embl file. + /* + * JAL-1856 Embl returns this text for query not found */ - public EmblFile efile = null; + private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; public EmblXmlSource() { @@ -88,68 +89,36 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy public AlignmentI getEmblSequenceRecords(String emprefx, String query, File reply) throws Exception { - SequenceI seqs[] = null; - StringBuffer result = new StringBuffer(); + EmblFile efile = null; + List seqs = new ArrayList(); + if (reply != null && reply.exists()) { - efile = null; file = reply.getAbsolutePath(); - if (reply.length() > 25) + if (reply.length() > EMBL_NOT_FOUND_REPLY.length()) { efile = EmblFile.getEmblFile(reply); } - else - { - result.append(MessageManager.formatMessage( - "label.no_embl_record_found", - new String[] { emprefx.toLowerCase(), query.trim() })); - } } + + List peptides = new ArrayList(); if (efile != null) { for (EmblEntry entry : efile.getEntries()) { - SequenceI[] seqparts = entry.getSequences(false, true, emprefx); - // TODO: use !fetchNa,!fetchPeptide here instead - see todo in EmblEntry - if (seqparts != null) + SequenceI seq = entry.getSequence(emprefx, peptides); + if (seq != null) { - SequenceI[] newseqs = null; - int si = 0; - if (seqs == null) - { - newseqs = new SequenceI[seqparts.length]; - } - else - { - newseqs = new SequenceI[seqs.length + seqparts.length]; - - for (; si < seqs.length; si++) - { - newseqs[si] = seqs[si]; - seqs[si] = null; - } - } - for (int j = 0; j < seqparts.length; si++, j++) - { - newseqs[si] = seqparts[j].deriveSequence(); - // place DBReferences on dataset and refer - } - seqs = newseqs; - + seqs.add(seq.deriveSequence()); + // place DBReferences on dataset and refer } } } - else - { - result = null; - } + AlignmentI al = null; - if (seqs != null && seqs.length > 0) + if (!seqs.isEmpty()) { - al = new Alignment(seqs); - result.append(MessageManager.formatMessage( - "label.embl_successfully_parsed", new String[] { emprefx })); - results = result; + al = new Alignment(seqs.toArray(new SequenceI[seqs.size()])); } stopQuery(); return al; diff --git a/test/jalview/datamodel/xdb/embl/EmblEntryTest.java b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java new file mode 100644 index 0000000..9fffc45 --- /dev/null +++ b/test/jalview/datamodel/xdb/embl/EmblEntryTest.java @@ -0,0 +1,308 @@ +package jalview.datamodel.xdb.embl; + +import static org.testng.AssertJUnit.assertEquals; +import static org.testng.AssertJUnit.assertSame; + +import jalview.util.MappingUtils; + +import java.util.Arrays; +import java.util.Vector; + +import org.testng.annotations.Test; + +public class EmblEntryTest +{ + @Test(groups = "Functional") + public void testGetCdsRanges() + { + EmblEntry testee = new EmblEntry(); + + /* + * Make a (CDS) Feature with 4 locations + */ + EmblFeature cds = new EmblFeature(); + Vector locs = new Vector(); + cds.setLocations(locs); + + /* + * single range [10-20] + */ + EmblFeatureLocations loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + Vector elements = new Vector(); + EmblFeatureLocElement locElement = new EmblFeatureLocElement(); + BasePosition b1 = new BasePosition(); + b1.setPos("10"); + BasePosition b2 = new BasePosition(); + b2.setPos("20"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * complement range [30-40] + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(true); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("30"); + b2 = new BasePosition(); + b2.setPos("40"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * join range [50-60], [70-80] + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("join"); + loc.setLocationComplement(false); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("50"); + b2 = new BasePosition(); + b2.setPos("60"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("70"); + b2 = new BasePosition(); + b2.setPos("80"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * complement range [90-100], [110-120] + * this should be the same as complement(join(90..100,110.120)) + * which is "join 90-100 and 110-120, then complement" + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("join"); + loc.setLocationComplement(true); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("90"); + b2 = new BasePosition(); + b2.setPos("100"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("110"); + b2 = new BasePosition(); + b2.setPos("120"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + int[] exons = testee.getCdsRanges(cds); + assertEquals("[10, 20, 40, 30, 50, 60, 70, 80, 120, 110, 100, 90]", + Arrays.toString(exons)); + } + + @Test(groups = "Functional") + public void testGetCdsRanges_badData() + { + EmblEntry testee = new EmblEntry(); + + /* + * Make a (CDS) Feature with 4 locations + */ + EmblFeature cds = new EmblFeature(); + Vector locs = new Vector(); + cds.setLocations(locs); + + /* + * single range [10-20] + */ + EmblFeatureLocations loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + Vector elements = new Vector(); + EmblFeatureLocElement locElement = new EmblFeatureLocElement(); + BasePosition b1 = new BasePosition(); + b1.setPos("10"); + BasePosition b2 = new BasePosition(); + b2.setPos("20"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * single range with missing end position - should be skipped + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("30"); + locElement.setBasePositions(new BasePosition[] { b1 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * single range with extra base position - should be skipped + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("30"); + locElement.setBasePositions(new BasePosition[] { b1, b1, b1 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * single valid range [50-60] to finish + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("50"); + b2 = new BasePosition(); + b2.setPos("60"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + int[] exons = testee.getCdsRanges(cds); + assertEquals("[10, 20, 50, 60]", Arrays.toString(exons)); + } + + /** + * Test retrieval of exon locations matching an accession id + */ + @Test(groups = "Functional") + public void testGetCdsRanges_forAccession() + { + EmblEntry testee = new EmblEntry(); + String accession = "A1234"; + testee.setAccession(accession); + /* + * Make a (CDS) Feature with 4 locations + */ + EmblFeature cds = new EmblFeature(); + Vector locs = new Vector(); + cds.setLocations(locs); + + /* + * single range [10-20] for 'this' accession + */ + EmblFeatureLocations loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(false); + Vector elements = new Vector(); + EmblFeatureLocElement locElement = new EmblFeatureLocElement(); + locElement.setAccession(accession); + BasePosition b1 = new BasePosition(); + b1.setPos("10"); + BasePosition b2 = new BasePosition(); + b2.setPos("20"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * complement range [30-40] - no accession + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("single"); + loc.setLocationComplement(true); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + b1 = new BasePosition(); + b1.setPos("30"); + b2 = new BasePosition(); + b2.setPos("40"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * join range [50-60] this accession, [70-80] another + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("join"); + loc.setLocationComplement(false); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + locElement.setAccession(accession); + b1 = new BasePosition(); + b1.setPos("50"); + b2 = new BasePosition(); + b2.setPos("60"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + locElement = new EmblFeatureLocElement(); + locElement.setAccession("notme"); + b1 = new BasePosition(); + b1.setPos("70"); + b2 = new BasePosition(); + b2.setPos("80"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * complement range [90-100] wrong accession, [110-120] good + * this should be the same as complement(join(90..100,110.120)) + * which is "join 90-100 and 110-120, then complement" + */ + loc = new EmblFeatureLocations(); + loc.setLocationType("join"); + loc.setLocationComplement(true); + elements = new Vector(); + locElement = new EmblFeatureLocElement(); + locElement.setAccession("wrong"); + b1 = new BasePosition(); + b1.setPos("90"); + b2 = new BasePosition(); + b2.setPos("100"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + locElement = new EmblFeatureLocElement(); + locElement.setAccession(accession); + b1 = new BasePosition(); + b1.setPos("110"); + b2 = new BasePosition(); + b2.setPos("120"); + locElement.setBasePositions(new BasePosition[] { b1, b2 }); + elements.add(locElement); + loc.setLocElements(elements); + locs.add(loc); + + /* + * verify we pick out only ranges for A1234 + */ + int[] exons = testee.getCdsRanges(cds); + assertEquals("[10, 20, 50, 60, 120, 110]", + Arrays.toString(exons)); + } +} diff --git a/test/jalview/util/ArrayUtilsTest.java b/test/jalview/util/ArrayUtilsTest.java new file mode 100644 index 0000000..5a2674a --- /dev/null +++ b/test/jalview/util/ArrayUtilsTest.java @@ -0,0 +1,31 @@ +package jalview.util; + +import static org.testng.AssertJUnit.assertEquals; + +import java.util.Arrays; + +import org.testng.annotations.Test; + +public class ArrayUtilsTest +{ + @Test(groups="Functional") + public void testReverseIntArray() { + + // null value: should be no exception + ArrayUtils.reverseIntArray((int[]) null); + + // empty array: should be no exception + int[] arr = new int[] {}; + ArrayUtils.reverseIntArray(arr); + + // even length array + arr = new int[] { 1, 2, 3, 4 }; + ArrayUtils.reverseIntArray(arr); + assertEquals("[4, 3, 2, 1]", Arrays.toString(arr)); + + // odd length array + arr = new int[] { 1, 2, 3, 4, 5 }; + ArrayUtils.reverseIntArray(arr); + assertEquals("[5, 4, 3, 2, 1]", Arrays.toString(arr)); + } +} diff --git a/test/jalview/util/MappingUtilsTest.java b/test/jalview/util/MappingUtilsTest.java index 853ebd5..b53d513 100644 --- a/test/jalview/util/MappingUtilsTest.java +++ b/test/jalview/util/MappingUtilsTest.java @@ -24,7 +24,6 @@ import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertSame; import static org.testng.AssertJUnit.assertTrue; -import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; import jalview.api.AlignViewportI; import jalview.commands.EditCommand; @@ -911,4 +910,107 @@ public class MappingUtilsTest assertFalse(MappingUtils.contains(ranges, -45)); } + /** + * Test the method that drops positions from the start of a mapped range + */ + @Test(groups = "Functional") + public void testRemoveStartPositions() + { + int[] ranges = new int[] { 1, 10 }; + int[] adjusted = MappingUtils.removeStartPositions(0, ranges); + assertEquals("[1, 10]", Arrays.toString(adjusted)); + + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[2, 10]", Arrays.toString(adjusted)); + assertEquals("[1, 10]", Arrays.toString(ranges)); + + ranges = adjusted; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[3, 10]", Arrays.toString(adjusted)); + assertEquals("[2, 10]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 3, 10, 12 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[3, 3, 10, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 3, 10, 12]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 2, 8, 12 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[8, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 2, 8, 12]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 2, 8, 12 }; + adjusted = MappingUtils.removeStartPositions(2, ranges); + assertEquals("[9, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 2, 8, 12]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 2, 4, 4, 9, 12 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[4, 4, 9, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 2, 4, 4, 9, 12]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 2, 4, 4, 9, 12 }; + adjusted = MappingUtils.removeStartPositions(2, ranges); + assertEquals("[9, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 2, 4, 4, 9, 12]", Arrays.toString(ranges)); + + ranges = new int[] { 2, 3, 9, 12 }; + adjusted = MappingUtils.removeStartPositions(3, ranges); + assertEquals("[10, 12]", Arrays.toString(adjusted)); + assertEquals("[2, 3, 9, 12]", Arrays.toString(ranges)); + } + + /** + * Test the method that drops positions from the start of a mapped range, on + * the reverse strand + */ + @Test(groups = "Functional") + public void testRemoveStartPositions_reverseStrand() + { + int[] ranges = new int[] { 10, 1 }; + int[] adjusted = MappingUtils.removeStartPositions(0, ranges); + assertEquals("[10, 1]", Arrays.toString(adjusted)); + assertEquals("[10, 1]", Arrays.toString(ranges)); + + ranges = adjusted; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[9, 1]", Arrays.toString(adjusted)); + assertEquals("[10, 1]", Arrays.toString(ranges)); + + ranges = adjusted; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[8, 1]", Arrays.toString(adjusted)); + assertEquals("[9, 1]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 11, 9, 6 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[11, 11, 9, 6]", Arrays.toString(adjusted)); + assertEquals("[12, 11, 9, 6]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 12, 8, 4 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[8, 4]", Arrays.toString(adjusted)); + assertEquals("[12, 12, 8, 4]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 12, 8, 4 }; + adjusted = MappingUtils.removeStartPositions(2, ranges); + assertEquals("[7, 4]", Arrays.toString(adjusted)); + assertEquals("[12, 12, 8, 4]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 12, 10, 10, 8, 4 }; + adjusted = MappingUtils.removeStartPositions(1, ranges); + assertEquals("[10, 10, 8, 4]", Arrays.toString(adjusted)); + assertEquals("[12, 12, 10, 10, 8, 4]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 12, 10, 10, 8, 4 }; + adjusted = MappingUtils.removeStartPositions(2, ranges); + assertEquals("[8, 4]", Arrays.toString(adjusted)); + assertEquals("[12, 12, 10, 10, 8, 4]", Arrays.toString(ranges)); + + ranges = new int[] { 12, 11, 8, 4 }; + adjusted = MappingUtils.removeStartPositions(3, ranges); + assertEquals("[7, 4]", Arrays.toString(adjusted)); + assertEquals("[12, 11, 8, 4]", Arrays.toString(ranges)); + } + }