src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.util.Locale;
  24
  25 import java.io.File;
  26 import java.io.FileInputStream;
  27 import java.io.InputStream;
  28 import java.text.ParseException;
  29 import java.util.ArrayList;
  30 import java.util.Arrays;
  31 import java.util.Hashtable;
  32 import java.util.List;
  33 import java.util.Map;
  34 import java.util.Map.Entry;
  35
  36 import javax.xml.bind.JAXBContext;
  37 import javax.xml.bind.JAXBElement;
  38 import javax.xml.bind.JAXBException;
  39 import javax.xml.stream.FactoryConfigurationError;
  40 import javax.xml.stream.XMLInputFactory;
  41 import javax.xml.stream.XMLStreamException;
  42 import javax.xml.stream.XMLStreamReader;
  43
  44 import jalview.analysis.SequenceIdMatcher;
  45 import jalview.bin.Cache;
  46 import jalview.datamodel.Alignment;
  47 import jalview.datamodel.AlignmentI;
  48 import jalview.datamodel.DBRefEntry;
  49 import jalview.datamodel.DBRefSource;
  50 import jalview.datamodel.FeatureProperties;
  51 import jalview.datamodel.Mapping;
  52 import jalview.datamodel.Sequence;
  53 import jalview.datamodel.SequenceFeature;
  54 import jalview.datamodel.SequenceI;
  55 import jalview.util.DBRefUtils;
  56 import jalview.util.DnaUtils;
  57 import jalview.util.MapList;
  58 import jalview.util.MappingUtils;
  59 import jalview.util.MessageManager;
  60 import jalview.ws.ebi.EBIFetchClient;
  61 import jalview.xml.binding.embl.EntryType;
  62 import jalview.xml.binding.embl.EntryType.Feature;
  63 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  64 import jalview.xml.binding.embl.ROOT;
  65 import jalview.xml.binding.embl.XrefType;
  66
  67 /**
  68  * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
  69  * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
  70  *
  71  * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
  72  */
  73
  74 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  75 {
  76   // TODO: delete class or update tyhis validator for 2.12 style Platform.regex
  77   private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
  78   /*
  79    * JAL-1856 Embl returns this text for query not found
  80    */
  81   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  82
  83   public EmblXmlSource()
  84   {
  85     super();
  86   }
  87
  88   /**
  89    * Retrieves and parses an emblxml file, and returns an alignment containing
  90    * the parsed sequences, or null if none were found
  91    *
  92    * @param emprefx
  93    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  94    * @param query
  95    * @return
  96    * @throws Exception
  97    */
  98   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  99           throws Exception
 100   {
 101     startQuery();
 102     EBIFetchClient dbFetch = new EBIFetchClient();
 103     File reply;
 104     try
 105     {
 106       reply = dbFetch.fetchDataAsFile(
 107               emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(), "display=xml",
 108               "xml");
 109     } catch (Exception e)
 110     {
 111       stopQuery();
 112       throw new Exception(
 113               String.format("EBI EMBL XML retrieval failed for %s:%s",
 114                       emprefx.toLowerCase(Locale.ROOT), query.trim()),
 115               e);
 116     }
 117     return getEmblSequenceRecords(emprefx, query, reply);
 118   }
 119
 120   /**
 121    * parse an emblxml file stored locally
 122    *
 123    * @param emprefx
 124    *          either EMBL or EMBLCDS strings are allowed - anything else will
 125    *          not retrieve emblxml
 126    * @param query
 127    * @param file
 128    *          the EMBL XML file containing the results of a query
 129    * @return
 130    * @throws Exception
 131    */
 132   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 133           File reply) throws Exception
 134   {
 135     List<EntryType> entries = null;
 136     if (reply != null && reply.exists())
 137     {
 138       file = reply.getAbsolutePath();
 139       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 140       {
 141         InputStream is = new FileInputStream(reply);
 142         entries = getEmblEntries(is);
 143       }
 144     }
 145
 146     /*
 147      * invalid accession gets a reply with no <entry> elements, text content of
 148      * EmbFile reads something like (e.g.) this ungrammatical phrase
 149      * Entry: <acc> display type is either not supported or entry is not found.
 150      */
 151     AlignmentI al = null;
 152     List<SequenceI> seqs = new ArrayList<>();
 153     List<SequenceI> peptides = new ArrayList<>();
 154     if (entries != null)
 155     {
 156       for (EntryType entry : entries)
 157       {
 158         SequenceI seq = getSequence(emprefx, entry, peptides);
 159         if (seq != null)
 160         {
 161           seqs.add(seq.deriveSequence());
 162           // place DBReferences on dataset and refer
 163         }
 164       }
 165       if (!seqs.isEmpty())
 166       {
 167         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 168       }
 169       else
 170       {
 171         System.out.println(
 172                 "No record found for '" + emprefx + ":" + query + "'");
 173       }
 174     }
 175
 176     stopQuery();
 177     return al;
 178   }
 179
 180   /**
 181    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 182    * (possibly empty) list of <code>EntryType</code> objects.
 183    *
 184    * is
 185    *
 186    * @return
 187    */
 188   List<EntryType> getEmblEntries(InputStream is)
 189   {
 190     List<EntryType> entries = new ArrayList<>();
 191     try
 192     {
 193       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 194       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 195               .createXMLStreamReader(is);
 196       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 197       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 198               ROOT.class);
 199       ROOT root = rootElement.getValue();
 200
 201       /*
 202        * document root contains either "entry" or "entrySet"
 203        */
 204       if (root == null)
 205       {
 206         return entries;
 207       }
 208       if (root.getEntrySet() != null)
 209       {
 210         entries = root.getEntrySet().getEntry();
 211       }
 212       else if (root.getEntry() != null)
 213       {
 214         entries.add(root.getEntry());
 215       }
 216     } catch (JAXBException | XMLStreamException
 217             | FactoryConfigurationError e)
 218     {
 219       e.printStackTrace();
 220     }
 221     return entries;
 222   }
 223
 224   /**
 225    * A helper method to parse XML data and construct a sequence, with any
 226    * available database references and features
 227    *
 228    * @param emprefx
 229    * @param entry
 230    * @param peptides
 231    * @return
 232    */
 233   SequenceI getSequence(String sourceDb, EntryType entry,
 234           List<SequenceI> peptides)
 235   {
 236     String seqString = entry.getSequence();
 237     if (seqString == null)
 238     {
 239       return null;
 240     }
 241     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 242             "");
 243     String accession = entry.getAccession();
 244     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 245
 246     dna.setDescription(entry.getDescription());
 247     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 248     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 249             accession);
 250     dna.addDBRef(selfRref);
 251     selfRref.setMap(
 252             new Mapping(null, new int[]
 253             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 254                     1));
 255
 256     /*
 257      * add db references
 258      */
 259     List<XrefType> xrefs = entry.getXref();
 260     if (xrefs != null)
 261     {
 262       for (XrefType xref : xrefs)
 263       {
 264         String acc = xref.getId();
 265         String source = DBRefUtils.getCanonicalName(xref.getDb());
 266         String version = xref.getSecondaryId();
 267         if (version == null || "".equals(version))
 268         {
 269           version = "0";
 270         }
 271         dna.addDBRef(new DBRefEntry(source, version, acc));
 272       }
 273     }
 274
 275     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 276     try
 277     {
 278       List<Feature> features = entry.getFeature();
 279       if (features != null)
 280       {
 281         for (Feature feature : features)
 282         {
 283           if (FeatureProperties.isCodingFeature(sourceDb,
 284                   feature.getName()))
 285           {
 286             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 287                     matcher);
 288           }
 289         }
 290       }
 291     } catch (Exception e)
 292     {
 293       System.err.println("EMBL Record Features parsing error!");
 294       System.err
 295               .println("Please report the following to help@jalview.org :");
 296       System.err.println("EMBL Record " + accession);
 297       System.err.println("Resulted in exception: " + e.getMessage());
 298       e.printStackTrace(System.err);
 299     }
 300
 301     return dna;
 302   }
 303
 304   /**
 305    * Extracts coding region and product from a CDS feature and decorates it with
 306    * annotations
 307    *
 308    * @param entry
 309    * @param feature
 310    * @param sourceDb
 311    * @param dna
 312    * @param peptides
 313    * @param matcher
 314    */
 315   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 316           SequenceI dna, List<SequenceI> peptides,
 317           SequenceIdMatcher matcher)
 318   {
 319     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 320     final String accession = entry.getAccession();
 321     final String sequenceVersion = entry.getVersion().toString();
 322
 323     int[] exons = getCdsRanges(entry.getAccession(), feature);
 324
 325     String translation = null;
 326     String proteinName = "";
 327     String proteinId = null;
 328     Map<String, String> vals = new Hashtable<>();
 329
 330     /*
 331      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 332      * (phase is required for CDS features in GFF3 format)
 333      */
 334     int codonStart = 1;
 335
 336     /*
 337      * parse qualifiers, saving protein translation, protein id,
 338      * codon start position, product (name), and 'other values'
 339      */
 340     if (feature.getQualifier() != null)
 341     {
 342       for (Qualifier q : feature.getQualifier())
 343       {
 344         String qname = q.getName();
 345         String value = q.getValue();
 346         value = value == null ? ""
 347                 : value.trim().replace(" ", "").replace("\n", "")
 348                         .replace("\t", "");
 349         if (qname.equals("translation"))
 350         {
 351           translation = value;
 352         }
 353         else if (qname.equals("protein_id"))
 354         {
 355           proteinId = value;
 356         }
 357         else if (qname.equals("codon_start"))
 358         {
 359           try
 360           {
 361             codonStart = Integer.parseInt(value.trim());
 362           } catch (NumberFormatException e)
 363           {
 364             System.err.println("Invalid codon_start in XML for "
 365                     + entry.getAccession() + ": " + e.getMessage());
 366           }
 367         }
 368         else if (qname.equals("product"))
 369         {
 370           // sometimes name is returned e.g. for V00488
 371           proteinName = value;
 372         }
 373         else
 374         {
 375           // throw anything else into the additional properties hash
 376           if (!"".equals(value))
 377           {
 378             vals.put(qname, value);
 379           }
 380         }
 381       }
 382     }
 383
 384     DBRefEntry proteinToEmblProteinRef = null;
 385     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 386
 387     SequenceI product = null;
 388     Mapping dnaToProteinMapping = null;
 389     if (translation != null && proteinName != null && proteinId != null)
 390     {
 391       int translationLength = translation.length();
 392
 393       /*
 394        * look for product in peptides list, if not found, add it
 395        */
 396       product = matcher.findIdMatch(proteinId);
 397       if (product == null)
 398       {
 399         product = new Sequence(proteinId, translation, 1,
 400                 translationLength);
 401         product.setDescription(((proteinName.length() == 0)
 402                 ? "Protein Product from " + sourceDb
 403                 : proteinName));
 404         peptides.add(product);
 405         matcher.add(product);
 406       }
 407
 408       // we have everything - create the mapping and perhaps the protein
 409       // sequence
 410       if (exons == null || exons.length == 0)
 411       {
 412         /*
 413          * workaround until we handle dna location for CDS sequence
 414          * e.g. location="X53828.1:60..1058" correctly
 415          */
 416         System.err.println(
 417                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 418                         + sourceDb + ":" + entry.getAccession() + ")");
 419         int dnaLength = dna.getLength();
 420         if (translationLength * 3 == (1 - codonStart + dnaLength))
 421         {
 422           System.err.println(
 423                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 424           // this might occur for CDS sequences where no features are marked
 425           exons = new int[] { dna.getStart() + (codonStart - 1),
 426               dna.getEnd() };
 427           dnaToProteinMapping = new Mapping(product, exons,
 428                   new int[]
 429                   { 1, translationLength }, 3, 1);
 430         }
 431         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 432         {
 433           System.err.println(
 434                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 435           exons = new int[] { dna.getStart() + (codonStart - 1),
 436               dna.getEnd() - 3 };
 437           dnaToProteinMapping = new Mapping(product, exons,
 438                   new int[]
 439                   { 1, translationLength }, 3, 1);
 440         }
 441       }
 442       else
 443       {
 444         // Trim the exon mapping if necessary - the given product may only be a
 445         // fragment of a larger protein. (EMBL:AY043181 is an example)
 446
 447         if (isEmblCdna)
 448         {
 449           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 450           // map
 451           // if given a dataset reference, search dataset for parent EMBL
 452           // sequence if it exists and set its map
 453           // make a new feature annotating the coding contig
 454         }
 455         else
 456         {
 457           // final product length truncation check
 458           int[] cdsRanges = adjustForProteinLength(translationLength,
 459                   exons);
 460           dnaToProteinMapping = new Mapping(product, cdsRanges,
 461                   new int[]
 462                   { 1, translationLength }, 3, 1);
 463           if (product != null)
 464           {
 465             /*
 466              * make xref with mapping from protein to EMBL dna
 467              */
 468             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 469                     sequenceVersion, proteinId,
 470                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 471             product.addDBRef(proteinToEmblRef);
 472
 473             /*
 474              * make xref from protein to EMBLCDS; we assume here that the
 475              * CDS sequence version is same as dna sequence (?!)
 476              */
 477             MapList proteinToCdsMapList = new MapList(
 478                     new int[]
 479                     { 1, translationLength },
 480                     new int[]
 481                     { 1 + (codonStart - 1),
 482                         (codonStart - 1) + 3 * translationLength },
 483                     1, 3);
 484             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 485                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 486                     new Mapping(proteinToCdsMapList));
 487             product.addDBRef(proteinToEmblCdsRef);
 488
 489             /*
 490              * make 'direct' xref from protein to EMBLCDSPROTEIN
 491              */
 492             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 493             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 494             proteinToEmblProteinRef.setMap(null);
 495             product.addDBRef(proteinToEmblProteinRef);
 496           }
 497         }
 498       }
 499
 500       /*
 501        * add cds features to dna sequence
 502        */
 503       String cds = feature.getName(); // "CDS"
 504       for (int xint = 0; exons != null
 505               && xint < exons.length - 1; xint += 2)
 506       {
 507         int exonStart = exons[xint];
 508         int exonEnd = exons[xint + 1];
 509         int begin = Math.min(exonStart, exonEnd);
 510         int end = Math.max(exonStart, exonEnd);
 511         int exonNumber = xint / 2 + 1;
 512         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 513                 exonNumber, proteinName, proteinId);
 514
 515         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 516                 vals);
 517
 518         sf.setEnaLocation(feature.getLocation());
 519         boolean forwardStrand = exonStart <= exonEnd;
 520         sf.setStrand(forwardStrand ? "+" : "-");
 521         sf.setPhase(String.valueOf(codonStart - 1));
 522         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 523         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 524
 525         dna.addSequenceFeature(sf);
 526       }
 527     }
 528
 529     /*
 530      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 531      */
 532     boolean hasUniprotDbref = false;
 533     List<XrefType> xrefs = feature.getXref();
 534     if (xrefs != null)
 535     {
 536       boolean mappingUsed = false;
 537       for (XrefType xref : xrefs)
 538       {
 539         /*
 540          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 541          */
 542         String source = DBRefUtils.getCanonicalName(xref.getDb());
 543         String version = xref.getSecondaryId();
 544         if (version == null || "".equals(version))
 545         {
 546           version = "0";
 547         }
 548         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 549         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 550                 dbref.getAccessionId());
 551         if (source.equals(DBRefSource.UNIPROT))
 552         {
 553           String proteinSeqName = DBRefSource.UNIPROT + "|"
 554                   + dbref.getAccessionId();
 555           if (dnaToProteinMapping != null
 556                   && dnaToProteinMapping.getTo() != null)
 557           {
 558             if (mappingUsed)
 559             {
 560               /*
 561                * two or more Uniprot xrefs for the same CDS -
 562                * each needs a distinct Mapping (as to a different sequence)
 563                */
 564               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 565             }
 566             mappingUsed = true;
 567
 568             /*
 569              * try to locate the protein mapped to (possibly by a
 570              * previous CDS feature); if not found, construct it from
 571              * the EMBL translation
 572              */
 573             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 574             if (proteinSeq == null)
 575             {
 576               proteinSeq = new Sequence(proteinSeqName,
 577                       product.getSequenceAsString());
 578               matcher.add(proteinSeq);
 579               proteinSeq.setDescription(product.getDescription());
 580               peptides.add(proteinSeq);
 581             }
 582             dnaToProteinMapping.setTo(proteinSeq);
 583             dnaToProteinMapping.setMappedFromId(proteinId);
 584             proteinSeq.addDBRef(proteinDbRef);
 585             dbref.setMap(dnaToProteinMapping);
 586           }
 587           hasUniprotDbref = true;
 588         }
 589         if (product != null)
 590         {
 591           /*
 592            * copy feature dbref to our protein product
 593            */
 594           DBRefEntry pref = proteinDbRef;
 595           pref.setMap(null); // reference is direct
 596           product.addDBRef(pref);
 597           // Add converse mapping reference
 598           if (dnaToProteinMapping != null)
 599           {
 600             Mapping pmap = new Mapping(dna,
 601                     dnaToProteinMapping.getMap().getInverse());
 602             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 603             pref.setMap(pmap);
 604             if (dnaToProteinMapping.getTo() != null)
 605             {
 606               dnaToProteinMapping.getTo().addDBRef(pref);
 607             }
 608           }
 609         }
 610         dna.addDBRef(dbref);
 611       }
 612     }
 613
 614     /*
 615      * if we have a product (translation) but no explicit Uniprot dbref
 616      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 617      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 618      */
 619     if (!hasUniprotDbref && product != null)
 620     {
 621       if (proteinToEmblProteinRef == null)
 622       {
 623         // assuming CDSPROTEIN sequence version = dna version (?!)
 624         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 625                 sequenceVersion, proteinId);
 626       }
 627       product.addDBRef(proteinToEmblProteinRef);
 628
 629       if (dnaToProteinMapping != null
 630               && dnaToProteinMapping.getTo() != null)
 631       {
 632         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 633                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 634         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 635         dnaToProteinMapping.setMappedFromId(proteinId);
 636         dna.addDBRef(dnaToEmblProteinRef);
 637       }
 638     }
 639   }
 640
 641   @Override
 642   public boolean isDnaCoding()
 643   {
 644     return true;
 645   }
 646
 647   /**
 648    * Returns the CDS positions as a single array of [start, end, start, end...]
 649    * positions. If on the reverse strand, these will be in descending order.
 650    *
 651    * @param accession
 652    * @param feature
 653    * @return
 654    */
 655   protected int[] getCdsRanges(String accession, Feature feature)
 656   {
 657     String location = feature.getLocation();
 658     if (location == null)
 659     {
 660       return new int[] {};
 661     }
 662     try
 663     {
 664       List<int[]> ranges = DnaUtils.parseLocation(location);
 665       return listToArray(ranges);
 666     } catch (ParseException e)
 667     {
 668       Cache.log.warn(
 669               String.format("Not parsing inexact CDS location %s in ENA %s",
 670                       location, accession));
 671       return new int[] {};
 672     }
 673   }
 674
 675   /**
 676    * Converts a list of [start, end] ranges to a single array of [start, end,
 677    * start, end ...]
 678    *
 679    * @param ranges
 680    * @return
 681    */
 682   int[] listToArray(List<int[]> ranges)
 683   {
 684     int[] result = new int[ranges.size() * 2];
 685     int i = 0;
 686     for (int[] range : ranges)
 687     {
 688       result[i++] = range[0];
 689       result[i++] = range[1];
 690     }
 691     return result;
 692   }
 693
 694   /**
 695    * Helper method to construct a SequenceFeature for one cds range
 696    *
 697    * @param type
 698    *          feature type ("CDS")
 699    * @param desc
 700    *          description
 701    * @param begin
 702    *          start position
 703    * @param end
 704    *          end position
 705    * @param group
 706    *          feature group
 707    * @param vals
 708    *          map of 'miscellaneous values' for feature
 709    * @return
 710    */
 711   protected SequenceFeature makeCdsFeature(String type, String desc,
 712           int begin, int end, String group, Map<String, String> vals)
 713   {
 714     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 715     if (!vals.isEmpty())
 716     {
 717       for (Entry<String, String> val : vals.entrySet())
 718       {
 719         sf.setValue(val.getKey(), val.getValue());
 720       }
 721     }
 722     return sf;
 723   }
 724
 725   @Override
 726   public String getAccessionSeparator()
 727   {
 728     return null;
 729   }
 730
 731   @Override
 732   public Regex getAccessionValidator()
 733   {
 734     return ACCESSION_REGEX;
 735   }
 736
 737   @Override
 738   public String getDbVersion()
 739   {
 740     return "0";
 741   }
 742
 743   @Override
 744   public int getTier()
 745   {
 746     return 0;
 747   }
 748
 749   @Override
 750   public boolean isValidReference(String accession)
 751   {
 752     if (accession == null || accession.length() < 2)
 753     {
 754       return false;
 755     }
 756     return getAccessionValidator().search(accession);
 757   }
 758   /**
 759    * Truncates (if necessary) the exon intervals to match 3 times the length of
 760    * the protein; also accepts 3 bases longer (for stop codon not included in
 761    * protein)
 762    *
 763    * @param proteinLength
 764    * @param exon
 765    *          an array of [start, end, start, end...] intervals
 766    * @return the same array (if unchanged) or a truncated copy
 767    */
 768   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 769   {
 770     if (proteinLength <= 0 || exon == null)
 771     {
 772       return exon;
 773     }
 774     int expectedCdsLength = proteinLength * 3;
 775     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 776     /*
 777      * if exon length matches protein, or is shorter, or longer by the
 778      * length of a stop codon (3 bases), then leave it unchanged
 779      */
 780     if (expectedCdsLength >= exonLength
 781             || expectedCdsLength == exonLength - 3)
 782     {
 783       return exon;
 784     }
 785     int origxon[];
 786     int sxpos = -1;
 787     int endxon = 0;
 788     origxon = new int[exon.length];
 789     System.arraycopy(exon, 0, origxon, 0, exon.length);
 790     int cdspos = 0;
 791     for (int x = 0; x < exon.length; x += 2)
 792     {
 793       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 794       if (expectedCdsLength <= cdspos)
 795       {
 796         // advanced beyond last codon.
 797         sxpos = x;
 798         if (expectedCdsLength != cdspos)
 799         {
 800           // System.err
 801           // .println("Truncating final exon interval on region by "
 802           // + (cdspos - cdslength));
 803         }
 804         /*
 805          * shrink the final exon - reduce end position if forward
 806          * strand, increase it if reverse
 807          */
 808         if (exon[x + 1] >= exon[x])
 809         {
 810           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 811         }
 812         else
 813         {
 814           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 815         }
 816         break;
 817       }
 818     }
 819     if (sxpos != -1)
 820     {
 821       // and trim the exon interval set if necessary
 822       int[] nxon = new int[sxpos + 2];
 823       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 824       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 825                                 // set
 826       exon = nxon;
 827     }
 828     return exon;
 829   }
 830
 831 }