src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.io.File;
  24 import java.io.FileInputStream;
  25 import java.io.InputStream;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.Hashtable;
  30 import java.util.List;
  31 import java.util.Map;
  32 import java.util.Map.Entry;
  33
  34 import javax.xml.bind.JAXBContext;
  35 import javax.xml.bind.JAXBElement;
  36 import javax.xml.bind.JAXBException;
  37 import javax.xml.stream.FactoryConfigurationError;
  38 import javax.xml.stream.XMLInputFactory;
  39 import javax.xml.stream.XMLStreamException;
  40 import javax.xml.stream.XMLStreamReader;
  41
  42 import com.stevesoft.pat.Regex;
  43
  44 import jalview.analysis.SequenceIdMatcher;
  45 import jalview.bin.Cache;
  46 import jalview.datamodel.Alignment;
  47 import jalview.datamodel.AlignmentI;
  48 import jalview.datamodel.DBRefEntry;
  49 import jalview.datamodel.DBRefSource;
  50 import jalview.datamodel.FeatureProperties;
  51 import jalview.datamodel.Mapping;
  52 import jalview.datamodel.Sequence;
  53 import jalview.datamodel.SequenceFeature;
  54 import jalview.datamodel.SequenceI;
  55 import jalview.util.DBRefUtils;
  56 import jalview.util.DnaUtils;
  57 import jalview.util.MapList;
  58 import jalview.util.MappingUtils;
  59 import jalview.ws.ebi.EBIFetchClient;
  60 import jalview.xml.binding.embl.EntryType;
  61 import jalview.xml.binding.embl.EntryType.Feature;
  62 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  63 import jalview.xml.binding.embl.ROOT;
  64 import jalview.xml.binding.embl.XrefType;
  65
  66 /**
  67  * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
  68  * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
  69  *
  70  * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
  71  */
  72
  73 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  74 {
  75   private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
  76
  77   /*
  78    * JAL-1856 Embl returns this text for query not found
  79    */
  80   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  81
  82   public EmblXmlSource()
  83   {
  84     super();
  85   }
  86
  87   /**
  88    * Retrieves and parses an emblxml file, and returns an alignment containing
  89    * the parsed sequences, or null if none were found
  90    *
  91    * @param emprefx
  92    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  93    * @param query
  94    * @return
  95    * @throws Exception
  96    */
  97   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  98           throws Exception
  99   {
 100     startQuery();
 101     EBIFetchClient dbFetch = new EBIFetchClient();
 102     File reply;
 103     try
 104     {
 105       reply = dbFetch.fetchDataAsFile(
 106               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
 107               "xml");
 108     } catch (Exception e)
 109     {
 110       stopQuery();
 111       throw new Exception(
 112               String.format("EBI EMBL XML retrieval failed for %s:%s",
 113                       emprefx.toLowerCase(), query.trim()),
 114               e);
 115     }
 116     return getEmblSequenceRecords(emprefx, query, reply);
 117   }
 118
 119   /**
 120    * parse an emblxml file stored locally
 121    *
 122    * @param emprefx
 123    *          either EMBL or EMBLCDS strings are allowed - anything else will
 124    *          not retrieve emblxml
 125    * @param query
 126    * @param file
 127    *          the EMBL XML file containing the results of a query
 128    * @return
 129    * @throws Exception
 130    */
 131   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 132           File reply) throws Exception
 133   {
 134     List<EntryType> entries = null;
 135     if (reply != null && reply.exists())
 136     {
 137       file = reply.getAbsolutePath();
 138       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 139       {
 140         InputStream is = new FileInputStream(reply);
 141         entries = getEmblEntries(is);
 142       }
 143     }
 144
 145     /*
 146      * invalid accession gets a reply with no <entry> elements, text content of
 147      * EmbFile reads something like (e.g.) this ungrammatical phrase
 148      * Entry: <acc> display type is either not supported or entry is not found.
 149      */
 150     AlignmentI al = null;
 151     List<SequenceI> seqs = new ArrayList<>();
 152     List<SequenceI> peptides = new ArrayList<>();
 153     if (entries != null)
 154     {
 155       for (EntryType entry : entries)
 156       {
 157         SequenceI seq = getSequence(emprefx, entry, peptides);
 158         if (seq != null)
 159         {
 160           seqs.add(seq.deriveSequence());
 161           // place DBReferences on dataset and refer
 162         }
 163       }
 164       if (!seqs.isEmpty())
 165       {
 166         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 167       }
 168       else
 169       {
 170         System.out.println(
 171                 "No record found for '" + emprefx + ":" + query + "'");
 172       }
 173     }
 174
 175     stopQuery();
 176     return al;
 177   }
 178
 179   /**
 180    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 181    * (possibly empty) list of <code>EntryType</code> objects.
 182    *
 183    * is
 184    *
 185    * @return
 186    */
 187   List<EntryType> getEmblEntries(InputStream is)
 188   {
 189     List<EntryType> entries = new ArrayList<>();
 190     try
 191     {
 192       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 193       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 194               .createXMLStreamReader(is);
 195       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 196       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 197               ROOT.class);
 198       ROOT root = rootElement.getValue();
 199
 200       /*
 201        * document root contains either "entry" or "entrySet"
 202        */
 203       if (root == null)
 204       {
 205         return entries;
 206       }
 207       if (root.getEntrySet() != null)
 208       {
 209         entries = root.getEntrySet().getEntry();
 210       }
 211       else if (root.getEntry() != null)
 212       {
 213         entries.add(root.getEntry());
 214       }
 215     } catch (JAXBException | XMLStreamException
 216             | FactoryConfigurationError e)
 217     {
 218       e.printStackTrace();
 219     }
 220     return entries;
 221   }
 222
 223   /**
 224    * A helper method to parse XML data and construct a sequence, with any
 225    * available database references and features
 226    *
 227    * @param emprefx
 228    * @param entry
 229    * @param peptides
 230    * @return
 231    */
 232   SequenceI getSequence(String sourceDb, EntryType entry,
 233           List<SequenceI> peptides)
 234   {
 235     String seqString = entry.getSequence();
 236     if (seqString == null)
 237     {
 238       return null;
 239     }
 240     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 241             "");
 242     String accession = entry.getAccession();
 243     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 244
 245     dna.setDescription(entry.getDescription());
 246     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 247     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 248             accession);
 249     dna.addDBRef(selfRref);
 250     selfRref.setMap(
 251             new Mapping(null, new int[]
 252             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 253                     1));
 254
 255     /*
 256      * add db references
 257      */
 258     List<XrefType> xrefs = entry.getXref();
 259     if (xrefs != null)
 260     {
 261       for (XrefType xref : xrefs)
 262       {
 263         String acc = xref.getId();
 264         String source = DBRefUtils.getCanonicalName(xref.getDb());
 265         String version = xref.getSecondaryId();
 266         if (version == null || "".equals(version))
 267         {
 268           version = "0";
 269         }
 270         dna.addDBRef(new DBRefEntry(source, version, acc));
 271       }
 272     }
 273
 274     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 275     try
 276     {
 277       List<Feature> features = entry.getFeature();
 278       if (features != null)
 279       {
 280         for (Feature feature : features)
 281         {
 282           if (FeatureProperties.isCodingFeature(sourceDb,
 283                   feature.getName()))
 284           {
 285             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 286                     matcher);
 287           }
 288         }
 289       }
 290     } catch (Exception e)
 291     {
 292       System.err.println("EMBL Record Features parsing error!");
 293       System.err
 294               .println("Please report the following to help@jalview.org :");
 295       System.err.println("EMBL Record " + accession);
 296       System.err.println("Resulted in exception: " + e.getMessage());
 297       e.printStackTrace(System.err);
 298     }
 299
 300     return dna;
 301   }
 302
 303   /**
 304    * Extracts coding region and product from a CDS feature and decorates it with
 305    * annotations
 306    *
 307    * @param entry
 308    * @param feature
 309    * @param sourceDb
 310    * @param dna
 311    * @param peptides
 312    * @param matcher
 313    */
 314   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 315           SequenceI dna, List<SequenceI> peptides,
 316           SequenceIdMatcher matcher)
 317   {
 318     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 319     final String accession = entry.getAccession();
 320     final String sequenceVersion = entry.getVersion().toString();
 321
 322     int[] exons = getCdsRanges(entry.getAccession(), feature);
 323
 324     String translation = null;
 325     String proteinName = "";
 326     String proteinId = null;
 327     Map<String, String> vals = new Hashtable<>();
 328
 329     /*
 330      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 331      * (phase is required for CDS features in GFF3 format)
 332      */
 333     int codonStart = 1;
 334
 335     /*
 336      * parse qualifiers, saving protein translation, protein id,
 337      * codon start position, product (name), and 'other values'
 338      */
 339     if (feature.getQualifier() != null)
 340     {
 341       for (Qualifier q : feature.getQualifier())
 342       {
 343         String qname = q.getName();
 344         String value = q.getValue();
 345         value = value == null ? ""
 346                 : value.trim().replace(" ", "").replace("\n", "")
 347                         .replace("\t", "");
 348         if (qname.equals("translation"))
 349         {
 350           translation = value;
 351         }
 352         else if (qname.equals("protein_id"))
 353         {
 354           proteinId = value;
 355         }
 356         else if (qname.equals("codon_start"))
 357         {
 358           try
 359           {
 360             codonStart = Integer.parseInt(value.trim());
 361           } catch (NumberFormatException e)
 362           {
 363             System.err.println("Invalid codon_start in XML for "
 364                     + entry.getAccession() + ": " + e.getMessage());
 365           }
 366         }
 367         else if (qname.equals("product"))
 368         {
 369           // sometimes name is returned e.g. for V00488
 370           proteinName = value;
 371         }
 372         else
 373         {
 374           // throw anything else into the additional properties hash
 375           if (!"".equals(value))
 376           {
 377             vals.put(qname, value);
 378           }
 379         }
 380       }
 381     }
 382
 383     DBRefEntry proteinToEmblProteinRef = null;
 384     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 385
 386     SequenceI product = null;
 387     Mapping dnaToProteinMapping = null;
 388     if (translation != null && proteinName != null && proteinId != null)
 389     {
 390       int translationLength = translation.length();
 391
 392       /*
 393        * look for product in peptides list, if not found, add it
 394        */
 395       product = matcher.findIdMatch(proteinId);
 396       if (product == null)
 397       {
 398         product = new Sequence(proteinId, translation, 1,
 399                 translationLength);
 400         product.setDescription(((proteinName.length() == 0)
 401                 ? "Protein Product from " + sourceDb
 402                 : proteinName));
 403         peptides.add(product);
 404         matcher.add(product);
 405       }
 406
 407       // we have everything - create the mapping and perhaps the protein
 408       // sequence
 409       if (exons == null || exons.length == 0)
 410       {
 411         /*
 412          * workaround until we handle dna location for CDS sequence
 413          * e.g. location="X53828.1:60..1058" correctly
 414          */
 415         System.err.println(
 416                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 417                         + sourceDb + ":" + entry.getAccession() + ")");
 418         int dnaLength = dna.getLength();
 419         if (translationLength * 3 == (1 - codonStart + dnaLength))
 420         {
 421           System.err.println(
 422                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 423           // this might occur for CDS sequences where no features are marked
 424           exons = new int[] { dna.getStart() + (codonStart - 1),
 425               dna.getEnd() };
 426           dnaToProteinMapping = new Mapping(product, exons,
 427                   new int[]
 428                   { 1, translationLength }, 3, 1);
 429         }
 430         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 431         {
 432           System.err.println(
 433                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 434           exons = new int[] { dna.getStart() + (codonStart - 1),
 435               dna.getEnd() - 3 };
 436           dnaToProteinMapping = new Mapping(product, exons,
 437                   new int[]
 438                   { 1, translationLength }, 3, 1);
 439         }
 440       }
 441       else
 442       {
 443         // Trim the exon mapping if necessary - the given product may only be a
 444         // fragment of a larger protein. (EMBL:AY043181 is an example)
 445
 446         if (isEmblCdna)
 447         {
 448           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 449           // map
 450           // if given a dataset reference, search dataset for parent EMBL
 451           // sequence if it exists and set its map
 452           // make a new feature annotating the coding contig
 453         }
 454         else
 455         {
 456           // final product length truncation check
 457           int [] exons2 = adjustForProteinLength(translationLength,
 458                   exons);
 459           dnaToProteinMapping = new Mapping(product, exons2,
 460                   new int[]
 461                   { 1, translationLength }, 3, 1);
 462           if (product != null)
 463           {
 464             /*
 465              * make xref with mapping from protein to EMBL dna
 466              */
 467             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 468                     sequenceVersion, proteinId,
 469                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 470             product.addDBRef(proteinToEmblRef);
 471
 472             /*
 473              * make xref from protein to EMBLCDS; we assume here that the
 474              * CDS sequence version is same as dna sequence (?!)
 475              */
 476             MapList proteinToCdsMapList = new MapList(
 477                     new int[]
 478                     { 1, translationLength },
 479                     new int[]
 480                     { 1 + (codonStart - 1),
 481                         (codonStart - 1) + 3 * translationLength },
 482                     1, 3);
 483             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 484                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 485                     new Mapping(proteinToCdsMapList));
 486             product.addDBRef(proteinToEmblCdsRef);
 487
 488             /*
 489              * make 'direct' xref from protein to EMBLCDSPROTEIN
 490              */
 491             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 492             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 493             proteinToEmblProteinRef.setMap(null);
 494             product.addDBRef(proteinToEmblProteinRef);
 495           }
 496         }
 497       }
 498
 499       /*
 500        * add cds features to dna sequence
 501        */
 502       String cds = feature.getName(); // "CDS"
 503       for (int xint = 0; exons != null
 504               && xint < exons.length - 1; xint += 2)
 505       {
 506         int exonStart = exons[xint];
 507         int exonEnd = exons[xint + 1];
 508         int begin = Math.min(exonStart, exonEnd);
 509         int end = Math.max(exonStart, exonEnd);
 510         int exonNumber = xint / 2 + 1;
 511         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 512                 exonNumber, proteinName, proteinId);
 513
 514         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 515                 vals);
 516
 517         sf.setEnaLocation(feature.getLocation());
 518         boolean forwardStrand = exonStart <= exonEnd;
 519         sf.setStrand(forwardStrand ? "+" : "-");
 520         sf.setPhase(String.valueOf(codonStart - 1));
 521         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 522         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 523
 524         dna.addSequenceFeature(sf);
 525       }
 526     }
 527
 528     /*
 529      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 530      */
 531     boolean hasUniprotDbref = false;
 532     List<XrefType> xrefs = feature.getXref();
 533     if (xrefs != null)
 534     {
 535       boolean mappingUsed = false;
 536       for (XrefType xref : xrefs)
 537       {
 538         /*
 539          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 540          */
 541         String source = DBRefUtils.getCanonicalName(xref.getDb());
 542         String version = xref.getSecondaryId();
 543         if (version == null || "".equals(version))
 544         {
 545           version = "0";
 546         }
 547         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 548         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 549                 dbref.getAccessionId());
 550         if (source.equals(DBRefSource.UNIPROT))
 551         {
 552           String proteinSeqName = DBRefSource.UNIPROT + "|"
 553                   + dbref.getAccessionId();
 554           if (dnaToProteinMapping != null
 555                   && dnaToProteinMapping.getTo() != null)
 556           {
 557             if (mappingUsed)
 558             {
 559               /*
 560                * two or more Uniprot xrefs for the same CDS -
 561                * each needs a distinct Mapping (as to a different sequence)
 562                */
 563               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 564             }
 565             mappingUsed = true;
 566
 567             /*
 568              * try to locate the protein mapped to (possibly by a
 569              * previous CDS feature); if not found, construct it from
 570              * the EMBL translation
 571              */
 572             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 573             if (proteinSeq == null)
 574             {
 575               proteinSeq = new Sequence(proteinSeqName,
 576                       product.getSequenceAsString());
 577               matcher.add(proteinSeq);
 578               proteinSeq.setDescription(product.getDescription());
 579               peptides.add(proteinSeq);
 580             }
 581             dnaToProteinMapping.setTo(proteinSeq);
 582             dnaToProteinMapping.setMappedFromId(proteinId);
 583             proteinSeq.addDBRef(proteinDbRef);
 584             dbref.setMap(dnaToProteinMapping);
 585           }
 586           hasUniprotDbref = true;
 587         }
 588         if (product != null)
 589         {
 590           /*
 591            * copy feature dbref to our protein product
 592            */
 593           DBRefEntry pref = proteinDbRef;
 594           pref.setMap(null); // reference is direct
 595           product.addDBRef(pref);
 596           // Add converse mapping reference
 597           if (dnaToProteinMapping != null)
 598           {
 599             Mapping pmap = new Mapping(dna,
 600                     dnaToProteinMapping.getMap().getInverse());
 601             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 602             pref.setMap(pmap);
 603             if (dnaToProteinMapping.getTo() != null)
 604             {
 605               dnaToProteinMapping.getTo().addDBRef(pref);
 606             }
 607           }
 608         }
 609         dna.addDBRef(dbref);
 610       }
 611     }
 612
 613     /*
 614      * if we have a product (translation) but no explicit Uniprot dbref
 615      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 616      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 617      */
 618     if (!hasUniprotDbref && product != null)
 619     {
 620       if (proteinToEmblProteinRef == null)
 621       {
 622         // assuming CDSPROTEIN sequence version = dna version (?!)
 623         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 624                 sequenceVersion, proteinId);
 625       }
 626       product.addDBRef(proteinToEmblProteinRef);
 627
 628       if (dnaToProteinMapping != null
 629               && dnaToProteinMapping.getTo() != null)
 630       {
 631         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 632                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 633         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 634         dnaToProteinMapping.setMappedFromId(proteinId);
 635         dna.addDBRef(dnaToEmblProteinRef);
 636       }
 637     }
 638   }
 639
 640   @Override
 641   public boolean isDnaCoding()
 642   {
 643     return true;
 644   }
 645
 646   /**
 647    * Returns the CDS positions as a single array of [start, end, start, end...]
 648    * positions. If on the reverse strand, these will be in descending order.
 649    *
 650    * @param accession
 651    * @param feature
 652    * @return
 653    */
 654   protected int[] getCdsRanges(String accession, Feature feature)
 655   {
 656     String location = feature.getLocation();
 657     if (location == null)
 658     {
 659       return new int[] {};
 660     }
 661
 662     try
 663     {
 664       List<int[]> ranges = DnaUtils.parseLocation(location);
 665       return listToArray(ranges);
 666     } catch (ParseException e)
 667     {
 668       Cache.log.warn(
 669               String.format("Not parsing inexact CDS location %s in ENA %s",
 670                       location, accession));
 671       return new int[] {};
 672     }
 673   }
 674
 675   /**
 676    * Converts a list of [start, end] ranges to a single array of [start, end,
 677    * start, end ...]
 678    *
 679    * @param ranges
 680    * @return
 681    */
 682   int[] listToArray(List<int[]> ranges)
 683   {
 684     int[] result = new int[ranges.size() * 2];
 685     int i = 0;
 686     for (int[] range : ranges)
 687     {
 688       result[i++] = range[0];
 689       result[i++] = range[1];
 690     }
 691     return result;
 692   }
 693
 694   /**
 695    * Helper method to construct a SequenceFeature for one cds range
 696    *
 697    * @param type
 698    *          feature type ("CDS")
 699    * @param desc
 700    *          description
 701    * @param begin
 702    *          start position
 703    * @param end
 704    *          end position
 705    * @param group
 706    *          feature group
 707    * @param vals
 708    *          map of 'miscellaneous values' for feature
 709    * @return
 710    */
 711   protected SequenceFeature makeCdsFeature(String type, String desc,
 712           int begin, int end, String group, Map<String, String> vals)
 713   {
 714     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 715     if (!vals.isEmpty())
 716     {
 717       for (Entry<String, String> val : vals.entrySet())
 718       {
 719         sf.setValue(val.getKey(), val.getValue());
 720       }
 721     }
 722     return sf;
 723   }
 724
 725   @Override
 726   public String getAccessionSeparator()
 727   {
 728     return null;
 729   }
 730
 731   @Override
 732   public Regex getAccessionValidator()
 733   {
 734     return ACCESSION_REGEX;
 735   }
 736
 737   @Override
 738   public String getDbVersion()
 739   {
 740     return "0";
 741   }
 742
 743   @Override
 744   public int getTier()
 745   {
 746     return 0;
 747   }
 748
 749   @Override
 750   public boolean isValidReference(String accession)
 751   {
 752     if (accession == null || accession.length() < 2)
 753     {
 754       return false;
 755     }
 756     return getAccessionValidator().search(accession);
 757   }
 758
 759   /**
 760    * Truncates (if necessary) the exon intervals to match 3 times the length of
 761    * the protein (including truncation for stop codon included in exon)
 762    *
 763    * @param proteinLength
 764    * @param exon
 765    *          an array of [start, end, start, end...] intervals
 766    * @return the same array (if unchanged) or a truncated copy
 767    */
 768   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 769   {
 770     if (proteinLength <= 0 || exon == null)
 771     {
 772       return exon;
 773     }
 774     int expectedCdsLength = proteinLength * 3;
 775     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 776
 777     /*
 778      * if exon length matches protein, or is shorter, then leave it unchanged
 779      */
 780     if (expectedCdsLength >= exonLength)
 781     {
 782       return exon;
 783     }
 784
 785     int origxon[];
 786     int sxpos = -1;
 787     int endxon = 0;
 788     origxon = new int[exon.length];
 789     System.arraycopy(exon, 0, origxon, 0, exon.length);
 790     int cdspos = 0;
 791     for (int x = 0; x < exon.length; x += 2)
 792     {
 793       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 794       if (expectedCdsLength <= cdspos)
 795       {
 796         // advanced beyond last codon.
 797         sxpos = x;
 798         if (expectedCdsLength != cdspos)
 799         {
 800           // System.err
 801           // .println("Truncating final exon interval on region by "
 802           // + (cdspos - cdslength));
 803         }
 804
 805         /*
 806          * shrink the final exon - reduce end position if forward
 807          * strand, increase it if reverse
 808          */
 809         if (exon[x + 1] >= exon[x])
 810         {
 811           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 812         }
 813         else
 814         {
 815           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 816         }
 817         break;
 818       }
 819     }
 820
 821     if (sxpos != -1)
 822     {
 823       // and trim the exon interval set if necessary
 824       int[] nxon = new int[sxpos + 2];
 825       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 826       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 827                                 // set
 828       exon = nxon;
 829     }
 830     return exon;
 831   }
 832
 833 }