src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.util.Locale;
  24
  25 import java.io.File;
  26 import java.io.FileInputStream;
  27 import java.io.InputStream;
  28 import java.text.ParseException;
  29 import java.util.ArrayList;
  30 import java.util.Arrays;
  31 import java.util.Hashtable;
  32 import java.util.List;
  33 import java.util.Map;
  34 import java.util.Map.Entry;
  35
  36 import javax.xml.bind.JAXBContext;
  37 import javax.xml.bind.JAXBElement;
  38 import javax.xml.bind.JAXBException;
  39 import javax.xml.stream.FactoryConfigurationError;
  40 import javax.xml.stream.XMLInputFactory;
  41 import javax.xml.stream.XMLStreamException;
  42 import javax.xml.stream.XMLStreamReader;
  43
  44
  45 import jalview.analysis.SequenceIdMatcher;
  46 import jalview.bin.Console;
  47 import jalview.datamodel.Alignment;
  48 import jalview.datamodel.AlignmentI;
  49 import jalview.datamodel.DBRefEntry;
  50 import jalview.datamodel.DBRefSource;
  51 import jalview.datamodel.FeatureProperties;
  52 import jalview.datamodel.Mapping;
  53 import jalview.datamodel.Sequence;
  54 import jalview.datamodel.SequenceFeature;
  55 import jalview.datamodel.SequenceI;
  56 import jalview.util.DBRefUtils;
  57 import jalview.util.DnaUtils;
  58 import jalview.util.MapList;
  59 import jalview.util.MappingUtils;
  60 import jalview.util.MessageManager;
  61 import jalview.util.Platform;
  62 import jalview.ws.ebi.EBIFetchClient;
  63 import jalview.xml.binding.embl.EntryType;
  64 import jalview.xml.binding.embl.EntryType.Feature;
  65 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  66 import jalview.xml.binding.embl.ROOT;
  67 import jalview.xml.binding.embl.XrefType;
  68
  69 import com.stevesoft.pat.Regex;
  70
  71 /**
  72  * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
  73  * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
  74  *
  75  * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
  76  */
  77 @Deprecated
  78 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  79 {
  80   // TODO: delete class or update tyhis validator for 2.12 style Platform.regex
  81     private static final Regex ACCESSION_REGEX = Platform.newRegex("^[A-Z]+[0-9]+");
  82
  83   /*
  84    * JAL-1856 Embl returns this text for query not found
  85    */
  86   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  87
  88   public EmblXmlSource()
  89   {
  90     super();
  91   }
  92
  93   /**
  94    * Retrieves and parses an emblxml file, and returns an alignment containing
  95    * the parsed sequences, or null if none were found
  96    *
  97    * @param emprefx
  98    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  99    * @param query
 100    * @return
 101    * @throws Exception
 102    */
 103   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
 104           throws Exception
 105   {
 106     startQuery();
 107     EBIFetchClient dbFetch = new EBIFetchClient();
 108     File reply;
 109     try
 110     {
 111       reply = dbFetch.fetchDataAsFile(
 112               emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml",
 113               "xml");
 114     } catch (Exception e)
 115     {
 116       stopQuery();
 117       throw new Exception(
 118               String.format("EBI EMBL XML retrieval failed for %s:%s",
 119                       emprefx.toLowerCase(Locale.ROOT), query.trim()),
 120               e);
 121     }
 122     return getEmblSequenceRecords(emprefx, query, reply);
 123   }
 124
 125   /**
 126    * parse an emblxml file stored locally
 127    *
 128    * @param emprefx
 129    *          either EMBL or EMBLCDS strings are allowed - anything else will
 130    *          not retrieve emblxml
 131    * @param query
 132    * @param file
 133    *          the EMBL XML file containing the results of a query
 134    * @return
 135    * @throws Exception
 136    */
 137   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 138           File reply) throws Exception
 139   {
 140     List<EntryType> entries = null;
 141     if (reply != null && reply.exists())
 142     {
 143       file = reply.getAbsolutePath();
 144       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 145       {
 146         InputStream is = new FileInputStream(reply);
 147         entries = getEmblEntries(is);
 148       }
 149     }
 150
 151     /*
 152      * invalid accession gets a reply with no <entry> elements, text content of
 153      * EmbFile reads something like (e.g.) this ungrammatical phrase
 154      * Entry: <acc> display type is either not supported or entry is not found.
 155      */
 156     AlignmentI al = null;
 157     List<SequenceI> seqs = new ArrayList<>();
 158     List<SequenceI> peptides = new ArrayList<>();
 159     if (entries != null)
 160     {
 161       for (EntryType entry : entries)
 162       {
 163         SequenceI seq = getSequence(emprefx, entry, peptides);
 164         if (seq != null)
 165         {
 166           seqs.add(seq.deriveSequence());
 167           // place DBReferences on dataset and refer
 168         }
 169       }
 170       if (!seqs.isEmpty())
 171       {
 172         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 173       }
 174       else
 175       {
 176         System.out.println(
 177                 "No record found for '" + emprefx + ":" + query + "'");
 178       }
 179     }
 180
 181     stopQuery();
 182     return al;
 183   }
 184
 185   /**
 186    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 187    * (possibly empty) list of <code>EntryType</code> objects.
 188    *
 189    * is
 190    *
 191    * @return
 192    */
 193   List<EntryType> getEmblEntries(InputStream is)
 194   {
 195     List<EntryType> entries = new ArrayList<>();
 196     try
 197     {
 198       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 199       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 200               .createXMLStreamReader(is);
 201       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 202       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 203               ROOT.class);
 204       ROOT root = rootElement.getValue();
 205
 206       /*
 207        * document root contains either "entry" or "entrySet"
 208        */
 209       if (root == null)
 210       {
 211         return entries;
 212       }
 213       if (root.getEntrySet() != null)
 214       {
 215         entries = root.getEntrySet().getEntry();
 216       }
 217       else if (root.getEntry() != null)
 218       {
 219         entries.add(root.getEntry());
 220       }
 221     } catch (JAXBException | XMLStreamException
 222             | FactoryConfigurationError e)
 223     {
 224       e.printStackTrace();
 225     }
 226     return entries;
 227   }
 228
 229   /**
 230    * A helper method to parse XML data and construct a sequence, with any
 231    * available database references and features
 232    *
 233    * @param emprefx
 234    * @param entry
 235    * @param peptides
 236    * @return
 237    */
 238   SequenceI getSequence(String sourceDb, EntryType entry,
 239           List<SequenceI> peptides)
 240   {
 241     String seqString = entry.getSequence();
 242     if (seqString == null)
 243     {
 244       return null;
 245     }
 246     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 247             "");
 248     String accession = entry.getAccession();
 249     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 250
 251     dna.setDescription(entry.getDescription());
 252     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 253     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 254             accession);
 255     dna.addDBRef(selfRref);
 256     selfRref.setMap(
 257             new Mapping(null, new int[]
 258             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 259                     1));
 260
 261     /*
 262      * add db references
 263      */
 264     List<XrefType> xrefs = entry.getXref();
 265     if (xrefs != null)
 266     {
 267       for (XrefType xref : xrefs)
 268       {
 269         String acc = xref.getId();
 270         String source = DBRefUtils.getCanonicalName(xref.getDb());
 271         String version = xref.getSecondaryId();
 272         if (version == null || "".equals(version))
 273         {
 274           version = "0";
 275         }
 276         dna.addDBRef(new DBRefEntry(source, version, acc));
 277       }
 278     }
 279
 280     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 281     try
 282     {
 283       List<Feature> features = entry.getFeature();
 284       if (features != null)
 285       {
 286         for (Feature feature : features)
 287         {
 288           if (FeatureProperties.isCodingFeature(sourceDb,
 289                   feature.getName()))
 290           {
 291             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 292                     matcher);
 293           }
 294         }
 295       }
 296     } catch (Exception e)
 297     {
 298       System.err.println("EMBL Record Features parsing error!");
 299       System.err
 300               .println("Please report the following to help@jalview.org :");
 301       System.err.println("EMBL Record " + accession);
 302       System.err.println("Resulted in exception: " + e.getMessage());
 303       e.printStackTrace(System.err);
 304     }
 305
 306     return dna;
 307   }
 308
 309   /**
 310    * Extracts coding region and product from a CDS feature and decorates it with
 311    * annotations
 312    *
 313    * @param entry
 314    * @param feature
 315    * @param sourceDb
 316    * @param dna
 317    * @param peptides
 318    * @param matcher
 319    */
 320   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 321           SequenceI dna, List<SequenceI> peptides,
 322           SequenceIdMatcher matcher)
 323   {
 324     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 325     final String accession = entry.getAccession();
 326     final String sequenceVersion = entry.getVersion().toString();
 327
 328     int[] exons = getCdsRanges(entry.getAccession(), feature);
 329
 330     String translation = null;
 331     String proteinName = "";
 332     String proteinId = null;
 333     Map<String, String> vals = new Hashtable<>();
 334
 335     /*
 336      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 337      * (phase is required for CDS features in GFF3 format)
 338      */
 339     int codonStart = 1;
 340
 341     /*
 342      * parse qualifiers, saving protein translation, protein id,
 343      * codon start position, product (name), and 'other values'
 344      */
 345     if (feature.getQualifier() != null)
 346     {
 347       for (Qualifier q : feature.getQualifier())
 348       {
 349         String qname = q.getName();
 350         String value = q.getValue();
 351         value = value == null ? ""
 352                 : value.trim().replace(" ", "").replace("\n", "")
 353                         .replace("\t", "");
 354         if (qname.equals("translation"))
 355         {
 356           translation = value;
 357         }
 358         else if (qname.equals("protein_id"))
 359         {
 360           proteinId = value;
 361         }
 362         else if (qname.equals("codon_start"))
 363         {
 364           try
 365           {
 366             codonStart = Integer.parseInt(value.trim());
 367           } catch (NumberFormatException e)
 368           {
 369             System.err.println("Invalid codon_start in XML for "
 370                     + entry.getAccession() + ": " + e.getMessage());
 371           }
 372         }
 373         else if (qname.equals("product"))
 374         {
 375           // sometimes name is returned e.g. for V00488
 376           proteinName = value;
 377         }
 378         else
 379         {
 380           // throw anything else into the additional properties hash
 381           if (!"".equals(value))
 382           {
 383             vals.put(qname, value);
 384           }
 385         }
 386       }
 387     }
 388
 389     DBRefEntry proteinToEmblProteinRef = null;
 390     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 391
 392     SequenceI product = null;
 393     Mapping dnaToProteinMapping = null;
 394     if (translation != null && proteinName != null && proteinId != null)
 395     {
 396       int translationLength = translation.length();
 397
 398       /*
 399        * look for product in peptides list, if not found, add it
 400        */
 401       product = matcher.findIdMatch(proteinId);
 402       if (product == null)
 403       {
 404         product = new Sequence(proteinId, translation, 1,
 405                 translationLength);
 406         product.setDescription(((proteinName.length() == 0)
 407                 ? "Protein Product from " + sourceDb
 408                 : proteinName));
 409         peptides.add(product);
 410         matcher.add(product);
 411       }
 412
 413       // we have everything - create the mapping and perhaps the protein
 414       // sequence
 415       if (exons == null || exons.length == 0)
 416       {
 417         /*
 418          * workaround until we handle dna location for CDS sequence
 419          * e.g. location="X53828.1:60..1058" correctly
 420          */
 421         System.err.println(
 422                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 423                         + sourceDb + ":" + entry.getAccession() + ")");
 424         int dnaLength = dna.getLength();
 425         if (translationLength * 3 == (1 - codonStart + dnaLength))
 426         {
 427           System.err.println(
 428                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 429           // this might occur for CDS sequences where no features are marked
 430           exons = new int[] { dna.getStart() + (codonStart - 1),
 431               dna.getEnd() };
 432           dnaToProteinMapping = new Mapping(product, exons,
 433                   new int[]
 434                   { 1, translationLength }, 3, 1);
 435         }
 436         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 437         {
 438           System.err.println(
 439                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 440           exons = new int[] { dna.getStart() + (codonStart - 1),
 441               dna.getEnd() - 3 };
 442           dnaToProteinMapping = new Mapping(product, exons,
 443                   new int[]
 444                   { 1, translationLength }, 3, 1);
 445         }
 446       }
 447       else
 448       {
 449         // Trim the exon mapping if necessary - the given product may only be a
 450         // fragment of a larger protein. (EMBL:AY043181 is an example)
 451
 452         if (isEmblCdna)
 453         {
 454           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 455           // map
 456           // if given a dataset reference, search dataset for parent EMBL
 457           // sequence if it exists and set its map
 458           // make a new feature annotating the coding contig
 459         }
 460         else
 461         {
 462           // final product length truncation check
 463           int[] cdsRanges = adjustForProteinLength(translationLength,
 464                   exons);
 465           dnaToProteinMapping = new Mapping(product, cdsRanges,
 466                   new int[]
 467                   { 1, translationLength }, 3, 1);
 468           if (product != null)
 469           {
 470             /*
 471              * make xref with mapping from protein to EMBL dna
 472              */
 473             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 474                     sequenceVersion, proteinId,
 475                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 476             product.addDBRef(proteinToEmblRef);
 477
 478             /*
 479              * make xref from protein to EMBLCDS; we assume here that the
 480              * CDS sequence version is same as dna sequence (?!)
 481              */
 482             MapList proteinToCdsMapList = new MapList(
 483                     new int[]
 484                     { 1, translationLength },
 485                     new int[]
 486                     { 1 + (codonStart - 1),
 487                         (codonStart - 1) + 3 * translationLength },
 488                     1, 3);
 489             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 490                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 491                     new Mapping(proteinToCdsMapList));
 492             product.addDBRef(proteinToEmblCdsRef);
 493
 494             /*
 495              * make 'direct' xref from protein to EMBLCDSPROTEIN
 496              */
 497             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 498             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 499             proteinToEmblProteinRef.setMap(null);
 500             product.addDBRef(proteinToEmblProteinRef);
 501           }
 502         }
 503       }
 504
 505       /*
 506        * add cds features to dna sequence
 507        */
 508       String cds = feature.getName(); // "CDS"
 509       for (int xint = 0; exons != null
 510               && xint < exons.length - 1; xint += 2)
 511       {
 512         int exonStart = exons[xint];
 513         int exonEnd = exons[xint + 1];
 514         int begin = Math.min(exonStart, exonEnd);
 515         int end = Math.max(exonStart, exonEnd);
 516         int exonNumber = xint / 2 + 1;
 517         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 518                 exonNumber, proteinName, proteinId);
 519
 520         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 521                 vals);
 522
 523         sf.setEnaLocation(feature.getLocation());
 524         boolean forwardStrand = exonStart <= exonEnd;
 525         sf.setStrand(forwardStrand ? "+" : "-");
 526         sf.setPhase(String.valueOf(codonStart - 1));
 527         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 528         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 529
 530         dna.addSequenceFeature(sf);
 531       }
 532     }
 533
 534     /*
 535      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 536      */
 537     boolean hasUniprotDbref = false;
 538     List<XrefType> xrefs = feature.getXref();
 539     if (xrefs != null)
 540     {
 541       boolean mappingUsed = false;
 542       for (XrefType xref : xrefs)
 543       {
 544         /*
 545          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 546          */
 547         String source = DBRefUtils.getCanonicalName(xref.getDb());
 548         String version = xref.getSecondaryId();
 549         if (version == null || "".equals(version))
 550         {
 551           version = "0";
 552         }
 553         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 554         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 555                 dbref.getAccessionId());
 556         if (source.equals(DBRefSource.UNIPROT))
 557         {
 558           String proteinSeqName = DBRefSource.UNIPROT + "|"
 559                   + dbref.getAccessionId();
 560           if (dnaToProteinMapping != null
 561                   && dnaToProteinMapping.getTo() != null)
 562           {
 563             if (mappingUsed)
 564             {
 565               /*
 566                * two or more Uniprot xrefs for the same CDS -
 567                * each needs a distinct Mapping (as to a different sequence)
 568                */
 569               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 570             }
 571             mappingUsed = true;
 572
 573             /*
 574              * try to locate the protein mapped to (possibly by a
 575              * previous CDS feature); if not found, construct it from
 576              * the EMBL translation
 577              */
 578             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 579             if (proteinSeq == null)
 580             {
 581               proteinSeq = new Sequence(proteinSeqName,
 582                       product.getSequenceAsString());
 583               matcher.add(proteinSeq);
 584               proteinSeq.setDescription(product.getDescription());
 585               peptides.add(proteinSeq);
 586             }
 587             dnaToProteinMapping.setTo(proteinSeq);
 588             dnaToProteinMapping.setMappedFromId(proteinId);
 589             proteinSeq.addDBRef(proteinDbRef);
 590             dbref.setMap(dnaToProteinMapping);
 591           }
 592           hasUniprotDbref = true;
 593         }
 594         if (product != null)
 595         {
 596           /*
 597            * copy feature dbref to our protein product
 598            */
 599           DBRefEntry pref = proteinDbRef;
 600           pref.setMap(null); // reference is direct
 601           product.addDBRef(pref);
 602           // Add converse mapping reference
 603           if (dnaToProteinMapping != null)
 604           {
 605             Mapping pmap = new Mapping(dna,
 606                     dnaToProteinMapping.getMap().getInverse());
 607             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 608             pref.setMap(pmap);
 609             if (dnaToProteinMapping.getTo() != null)
 610             {
 611               dnaToProteinMapping.getTo().addDBRef(pref);
 612             }
 613           }
 614         }
 615         dna.addDBRef(dbref);
 616       }
 617     }
 618
 619     /*
 620      * if we have a product (translation) but no explicit Uniprot dbref
 621      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 622      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 623      */
 624     if (!hasUniprotDbref && product != null)
 625     {
 626       if (proteinToEmblProteinRef == null)
 627       {
 628         // assuming CDSPROTEIN sequence version = dna version (?!)
 629         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 630                 sequenceVersion, proteinId);
 631       }
 632       product.addDBRef(proteinToEmblProteinRef);
 633
 634       if (dnaToProteinMapping != null
 635               && dnaToProteinMapping.getTo() != null)
 636       {
 637         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 638                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 639         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 640         dnaToProteinMapping.setMappedFromId(proteinId);
 641         dna.addDBRef(dnaToEmblProteinRef);
 642       }
 643     }
 644   }
 645
 646   @Override
 647   public boolean isDnaCoding()
 648   {
 649     return true;
 650   }
 651
 652   /**
 653    * Returns the CDS positions as a single array of [start, end, start, end...]
 654    * positions. If on the reverse strand, these will be in descending order.
 655    *
 656    * @param accession
 657    * @param feature
 658    * @return
 659    */
 660   protected int[] getCdsRanges(String accession, Feature feature)
 661   {
 662     String location = feature.getLocation();
 663     if (location == null)
 664     {
 665       return new int[] {};
 666     }
 667
 668     try
 669     {
 670       List<int[]> ranges = DnaUtils.parseLocation(location);
 671       return listToArray(ranges);
 672     } catch (ParseException e)
 673     {
 674       Console.warn(
 675               String.format("Not parsing inexact CDS location %s in ENA %s",
 676                       location, accession));
 677       return new int[] {};
 678     }
 679   }
 680
 681   /**
 682    * Converts a list of [start, end] ranges to a single array of [start, end,
 683    * start, end ...]
 684    *
 685    * @param ranges
 686    * @return
 687    */
 688   int[] listToArray(List<int[]> ranges)
 689   {
 690     int[] result = new int[ranges.size() * 2];
 691     int i = 0;
 692     for (int[] range : ranges)
 693     {
 694       result[i++] = range[0];
 695       result[i++] = range[1];
 696     }
 697     return result;
 698   }
 699
 700   /**
 701    * Helper method to construct a SequenceFeature for one cds range
 702    *
 703    * @param type
 704    *          feature type ("CDS")
 705    * @param desc
 706    *          description
 707    * @param begin
 708    *          start position
 709    * @param end
 710    *          end position
 711    * @param group
 712    *          feature group
 713    * @param vals
 714    *          map of 'miscellaneous values' for feature
 715    * @return
 716    */
 717   protected SequenceFeature makeCdsFeature(String type, String desc,
 718           int begin, int end, String group, Map<String, String> vals)
 719   {
 720     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 721     if (!vals.isEmpty())
 722     {
 723       for (Entry<String, String> val : vals.entrySet())
 724       {
 725         sf.setValue(val.getKey(), val.getValue());
 726       }
 727     }
 728     return sf;
 729   }
 730
 731   @Override
 732   public String getAccessionSeparator()
 733   {
 734     return null;
 735   }
 736
 737   @Override
 738   public Regex getAccessionValidator()
 739   {
 740     return ACCESSION_REGEX;
 741   }
 742
 743   @Override
 744   public String getDbVersion()
 745   {
 746     return "0";
 747   }
 748
 749   @Override
 750   public int getTier()
 751   {
 752     return 0;
 753   }
 754
 755   @Override
 756   public boolean isValidReference(String accession)
 757   {
 758     if (accession == null || accession.length() < 2)
 759     {
 760       return false;
 761     }
 762     return getAccessionValidator().search(accession);
 763   }
 764
 765   /**
 766    * Truncates (if necessary) the exon intervals to match 3 times the length of
 767    * the protein (including truncation for stop codon included in exon)
 768    *
 769    * @param proteinLength
 770    * @param exon
 771    *          an array of [start, end, start, end...] intervals
 772    * @return the same array (if unchanged) or a truncated copy
 773    */
 774   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 775   {
 776     if (proteinLength <= 0 || exon == null)
 777     {
 778       return exon;
 779     }
 780     int expectedCdsLength = proteinLength * 3;
 781     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 782
 783     /*
 784      * if exon length matches protein, or is shorter, then leave it unchanged
 785      */
 786     if (expectedCdsLength >= exonLength)
 787     {
 788       return exon;
 789     }
 790
 791     int origxon[];
 792     int sxpos = -1;
 793     int endxon = 0;
 794     origxon = new int[exon.length];
 795     System.arraycopy(exon, 0, origxon, 0, exon.length);
 796     int cdspos = 0;
 797     for (int x = 0; x < exon.length; x += 2)
 798     {
 799       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 800       if (expectedCdsLength <= cdspos)
 801       {
 802         // advanced beyond last codon.
 803         sxpos = x;
 804         if (expectedCdsLength != cdspos)
 805         {
 806           // System.err
 807           // .println("Truncating final exon interval on region by "
 808           // + (cdspos - cdslength));
 809         }
 810
 811         /*
 812          * shrink the final exon - reduce end position if forward
 813          * strand, increase it if reverse
 814          */
 815         if (exon[x + 1] >= exon[x])
 816         {
 817           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 818         }
 819         else
 820         {
 821           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 822         }
 823         break;
 824       }
 825     }
 826
 827     if (sxpos != -1)
 828     {
 829       // and trim the exon interval set if necessary
 830       int[] nxon = new int[sxpos + 2];
 831       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 832       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 833                                 // set
 834       exon = nxon;
 835     }
 836     return exon;
 837   }
 838
 839 }