src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.io.File;
  24 import java.io.FileInputStream;
  25 import java.io.InputStream;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.Hashtable;
  30 import java.util.List;
  31 import java.util.Locale;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34
  35 import javax.xml.bind.JAXBContext;
  36 import javax.xml.bind.JAXBElement;
  37 import javax.xml.bind.JAXBException;
  38 import javax.xml.stream.FactoryConfigurationError;
  39 import javax.xml.stream.XMLInputFactory;
  40 import javax.xml.stream.XMLStreamException;
  41 import javax.xml.stream.XMLStreamReader;
  42
  43 import com.stevesoft.pat.Regex;
  44
  45 import jalview.analysis.SequenceIdMatcher;
  46 import jalview.bin.Console;
  47 import jalview.datamodel.Alignment;
  48 import jalview.datamodel.AlignmentI;
  49 import jalview.datamodel.DBRefEntry;
  50 import jalview.datamodel.DBRefSource;
  51 import jalview.datamodel.FeatureProperties;
  52 import jalview.datamodel.Mapping;
  53 import jalview.datamodel.Sequence;
  54 import jalview.datamodel.SequenceFeature;
  55 import jalview.datamodel.SequenceI;
  56 import jalview.util.DBRefUtils;
  57 import jalview.util.DnaUtils;
  58 import jalview.util.MapList;
  59 import jalview.util.MappingUtils;
  60 import jalview.ws.ebi.EBIFetchClient;
  61 import jalview.xml.binding.embl.EntryType;
  62 import jalview.xml.binding.embl.EntryType.Feature;
  63 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  64 import jalview.xml.binding.embl.ROOT;
  65 import jalview.xml.binding.embl.XrefType;
  66
  67 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  68 {
  69   private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
  70
  71   /*
  72    * JAL-1856 Embl returns this text for query not found
  73    */
  74   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  75
  76   public EmblXmlSource()
  77   {
  78     super();
  79   }
  80
  81   /**
  82    * Retrieves and parses an emblxml file, and returns an alignment containing
  83    * the parsed sequences, or null if none were found
  84    *
  85    * @param emprefx
  86    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  87    * @param query
  88    * @return
  89    * @throws Exception
  90    */
  91   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  92           throws Exception
  93   {
  94     startQuery();
  95     EBIFetchClient dbFetch = new EBIFetchClient();
  96     File reply;
  97     try
  98     {
  99       reply = dbFetch.fetchDataAsFile(
 100               emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(),
 101               "display=xml", "xml");
 102     } catch (Exception e)
 103     {
 104       stopQuery();
 105       throw new Exception(
 106               String.format("EBI EMBL XML retrieval failed for %s:%s",
 107                       emprefx.toLowerCase(Locale.ROOT), query.trim()),
 108               e);
 109     }
 110     return getEmblSequenceRecords(emprefx, query, reply);
 111   }
 112
 113   /**
 114    * parse an emblxml file stored locally
 115    *
 116    * @param emprefx
 117    *          either EMBL or EMBLCDS strings are allowed - anything else will
 118    *          not retrieve emblxml
 119    * @param query
 120    * @param file
 121    *          the EMBL XML file containing the results of a query
 122    * @return
 123    * @throws Exception
 124    */
 125   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 126           File reply) throws Exception
 127   {
 128     List<EntryType> entries = null;
 129     if (reply != null && reply.exists())
 130     {
 131       file = reply.getAbsolutePath();
 132       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 133       {
 134         InputStream is = new FileInputStream(reply);
 135         entries = getEmblEntries(is);
 136       }
 137     }
 138
 139     /*
 140      * invalid accession gets a reply with no <entry> elements, text content of
 141      * EmbFile reads something like (e.g.) this ungrammatical phrase
 142      * Entry: <acc> display type is either not supported or entry is not found.
 143      */
 144     AlignmentI al = null;
 145     List<SequenceI> seqs = new ArrayList<>();
 146     List<SequenceI> peptides = new ArrayList<>();
 147     if (entries != null)
 148     {
 149       for (EntryType entry : entries)
 150       {
 151         SequenceI seq = getSequence(emprefx, entry, peptides);
 152         if (seq != null)
 153         {
 154           seqs.add(seq.deriveSequence());
 155           // place DBReferences on dataset and refer
 156         }
 157       }
 158       if (!seqs.isEmpty())
 159       {
 160         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 161       }
 162       else
 163       {
 164         jalview.bin.Console.outPrintln(
 165                 "No record found for '" + emprefx + ":" + query + "'");
 166       }
 167     }
 168
 169     stopQuery();
 170     return al;
 171   }
 172
 173   /**
 174    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 175    * (possibly empty) list of <code>EntryType</code> objects.
 176    *
 177    * is
 178    *
 179    * @return
 180    */
 181   List<EntryType> getEmblEntries(InputStream is)
 182   {
 183     List<EntryType> entries = new ArrayList<>();
 184     try
 185     {
 186       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 187       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 188               .createXMLStreamReader(is);
 189       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 190       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 191               ROOT.class);
 192       ROOT root = rootElement.getValue();
 193
 194       /*
 195        * document root contains either "entry" or "entrySet"
 196        */
 197       if (root == null)
 198       {
 199         return entries;
 200       }
 201       if (root.getEntrySet() != null)
 202       {
 203         entries = root.getEntrySet().getEntry();
 204       }
 205       else if (root.getEntry() != null)
 206       {
 207         entries.add(root.getEntry());
 208       }
 209     } catch (JAXBException | XMLStreamException
 210             | FactoryConfigurationError e)
 211     {
 212       e.printStackTrace();
 213     }
 214     return entries;
 215   }
 216
 217   /**
 218    * A helper method to parse XML data and construct a sequence, with any
 219    * available database references and features
 220    *
 221    * @param emprefx
 222    * @param entry
 223    * @param peptides
 224    * @return
 225    */
 226   SequenceI getSequence(String sourceDb, EntryType entry,
 227           List<SequenceI> peptides)
 228   {
 229     String seqString = entry.getSequence();
 230     if (seqString == null)
 231     {
 232       return null;
 233     }
 234     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 235             "");
 236     String accession = entry.getAccession();
 237     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 238
 239     dna.setDescription(entry.getDescription());
 240     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 241     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 242             accession);
 243     dna.addDBRef(selfRref);
 244     selfRref.setMap(
 245             new Mapping(null, new int[]
 246             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 247                     1));
 248
 249     /*
 250      * add db references
 251      */
 252     List<XrefType> xrefs = entry.getXref();
 253     if (xrefs != null)
 254     {
 255       for (XrefType xref : xrefs)
 256       {
 257         String acc = xref.getId();
 258         String source = DBRefUtils.getCanonicalName(xref.getDb());
 259         String version = xref.getSecondaryId();
 260         if (version == null || "".equals(version))
 261         {
 262           version = "0";
 263         }
 264         dna.addDBRef(new DBRefEntry(source, version, acc));
 265       }
 266     }
 267
 268     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 269     try
 270     {
 271       List<Feature> features = entry.getFeature();
 272       if (features != null)
 273       {
 274         for (Feature feature : features)
 275         {
 276           if (FeatureProperties.isCodingFeature(sourceDb,
 277                   feature.getName()))
 278           {
 279             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 280                     matcher);
 281           }
 282         }
 283       }
 284     } catch (Exception e)
 285     {
 286       jalview.bin.Console.errPrintln("EMBL Record Features parsing error!");
 287       System.err
 288               .println("Please report the following to help@jalview.org :");
 289       jalview.bin.Console.errPrintln("EMBL Record " + accession);
 290       jalview.bin.Console
 291               .errPrintln("Resulted in exception: " + e.getMessage());
 292       e.printStackTrace(System.err);
 293     }
 294
 295     return dna;
 296   }
 297
 298   /**
 299    * Extracts coding region and product from a CDS feature and decorates it with
 300    * annotations
 301    *
 302    * @param entry
 303    * @param feature
 304    * @param sourceDb
 305    * @param dna
 306    * @param peptides
 307    * @param matcher
 308    */
 309   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 310           SequenceI dna, List<SequenceI> peptides,
 311           SequenceIdMatcher matcher)
 312   {
 313     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 314     final String accession = entry.getAccession();
 315     final String sequenceVersion = entry.getVersion().toString();
 316
 317     int[] exons = getCdsRanges(entry.getAccession(), feature);
 318
 319     String translation = null;
 320     String proteinName = "";
 321     String proteinId = null;
 322     Map<String, String> vals = new Hashtable<>();
 323
 324     /*
 325      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 326      * (phase is required for CDS features in GFF3 format)
 327      */
 328     int codonStart = 1;
 329
 330     /*
 331      * parse qualifiers, saving protein translation, protein id,
 332      * codon start position, product (name), and 'other values'
 333      */
 334     if (feature.getQualifier() != null)
 335     {
 336       for (Qualifier q : feature.getQualifier())
 337       {
 338         String qname = q.getName();
 339         String value = q.getValue();
 340         value = value == null ? ""
 341                 : value.trim().replace(" ", "").replace("\n", "")
 342                         .replace("\t", "");
 343         if (qname.equals("translation"))
 344         {
 345           translation = value;
 346         }
 347         else if (qname.equals("protein_id"))
 348         {
 349           proteinId = value;
 350         }
 351         else if (qname.equals("codon_start"))
 352         {
 353           try
 354           {
 355             codonStart = Integer.parseInt(value.trim());
 356           } catch (NumberFormatException e)
 357           {
 358             jalview.bin.Console.errPrintln("Invalid codon_start in XML for "
 359                     + entry.getAccession() + ": " + e.getMessage());
 360           }
 361         }
 362         else if (qname.equals("product"))
 363         {
 364           // sometimes name is returned e.g. for V00488
 365           proteinName = value;
 366         }
 367         else
 368         {
 369           // throw anything else into the additional properties hash
 370           if (!"".equals(value))
 371           {
 372             vals.put(qname, value);
 373           }
 374         }
 375       }
 376     }
 377
 378     DBRefEntry proteinToEmblProteinRef = null;
 379     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 380
 381     SequenceI product = null;
 382     Mapping dnaToProteinMapping = null;
 383     if (translation != null && proteinName != null && proteinId != null)
 384     {
 385       int translationLength = translation.length();
 386
 387       /*
 388        * look for product in peptides list, if not found, add it
 389        */
 390       product = matcher.findIdMatch(proteinId);
 391       if (product == null)
 392       {
 393         product = new Sequence(proteinId, translation, 1,
 394                 translationLength);
 395         product.setDescription(((proteinName.length() == 0)
 396                 ? "Protein Product from " + sourceDb
 397                 : proteinName));
 398         peptides.add(product);
 399         matcher.add(product);
 400       }
 401
 402       // we have everything - create the mapping and perhaps the protein
 403       // sequence
 404       if (exons == null || exons.length == 0)
 405       {
 406         /*
 407          * workaround until we handle dna location for CDS sequence
 408          * e.g. location="X53828.1:60..1058" correctly
 409          */
 410         jalview.bin.Console.errPrintln(
 411                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 412                         + sourceDb + ":" + entry.getAccession() + ")");
 413         int dnaLength = dna.getLength();
 414         if (translationLength * 3 == (1 - codonStart + dnaLength))
 415         {
 416           jalview.bin.Console.errPrintln(
 417                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 418           // this might occur for CDS sequences where no features are marked
 419           exons = new int[] { dna.getStart() + (codonStart - 1),
 420               dna.getEnd() };
 421           dnaToProteinMapping = new Mapping(product, exons,
 422                   new int[]
 423                   { 1, translationLength }, 3, 1);
 424         }
 425         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 426         {
 427           jalview.bin.Console.errPrintln(
 428                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 429           exons = new int[] { dna.getStart() + (codonStart - 1),
 430               dna.getEnd() - 3 };
 431           dnaToProteinMapping = new Mapping(product, exons,
 432                   new int[]
 433                   { 1, translationLength }, 3, 1);
 434         }
 435       }
 436       else
 437       {
 438         // Trim the exon mapping if necessary - the given product may only be a
 439         // fragment of a larger protein. (EMBL:AY043181 is an example)
 440
 441         if (isEmblCdna)
 442         {
 443           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 444           // map
 445           // if given a dataset reference, search dataset for parent EMBL
 446           // sequence if it exists and set its map
 447           // make a new feature annotating the coding contig
 448         }
 449         else
 450         {
 451           // final product length truncation check
 452           int[] exons2 = adjustForProteinLength(translationLength, exons);
 453           dnaToProteinMapping = new Mapping(product, exons2,
 454                   new int[]
 455                   { 1, translationLength }, 3, 1);
 456           if (product != null)
 457           {
 458             /*
 459              * make xref with mapping from protein to EMBL dna
 460              */
 461             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 462                     sequenceVersion, proteinId,
 463                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 464             product.addDBRef(proteinToEmblRef);
 465
 466             /*
 467              * make xref from protein to EMBLCDS; we assume here that the
 468              * CDS sequence version is same as dna sequence (?!)
 469              */
 470             MapList proteinToCdsMapList = new MapList(
 471                     new int[]
 472                     { 1, translationLength },
 473                     new int[]
 474                     { 1 + (codonStart - 1),
 475                         (codonStart - 1) + 3 * translationLength },
 476                     1, 3);
 477             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 478                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 479                     new Mapping(proteinToCdsMapList));
 480             product.addDBRef(proteinToEmblCdsRef);
 481
 482             /*
 483              * make 'direct' xref from protein to EMBLCDSPROTEIN
 484              */
 485             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 486             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 487             proteinToEmblProteinRef.setMap(null);
 488             product.addDBRef(proteinToEmblProteinRef);
 489           }
 490         }
 491       }
 492
 493       /*
 494        * add cds features to dna sequence
 495        */
 496       String cds = feature.getName(); // "CDS"
 497       for (int xint = 0; exons != null
 498               && xint < exons.length - 1; xint += 2)
 499       {
 500         int exonStart = exons[xint];
 501         int exonEnd = exons[xint + 1];
 502         int begin = Math.min(exonStart, exonEnd);
 503         int end = Math.max(exonStart, exonEnd);
 504         int exonNumber = xint / 2 + 1;
 505         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 506                 exonNumber, proteinName, proteinId);
 507
 508         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 509                 vals);
 510
 511         sf.setEnaLocation(feature.getLocation());
 512         boolean forwardStrand = exonStart <= exonEnd;
 513         sf.setStrand(forwardStrand ? "+" : "-");
 514         sf.setPhase(String.valueOf(codonStart - 1));
 515         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 516         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 517
 518         dna.addSequenceFeature(sf);
 519       }
 520     }
 521
 522     /*
 523      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 524      */
 525     boolean hasUniprotDbref = false;
 526     List<XrefType> xrefs = feature.getXref();
 527     if (xrefs != null)
 528     {
 529       boolean mappingUsed = false;
 530       for (XrefType xref : xrefs)
 531       {
 532         /*
 533          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 534          */
 535         String source = DBRefUtils.getCanonicalName(xref.getDb());
 536         String version = xref.getSecondaryId();
 537         if (version == null || "".equals(version))
 538         {
 539           version = "0";
 540         }
 541         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 542         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 543                 dbref.getAccessionId());
 544         if (source.equals(DBRefSource.UNIPROT))
 545         {
 546           String proteinSeqName = DBRefSource.UNIPROT + "|"
 547                   + dbref.getAccessionId();
 548           if (dnaToProteinMapping != null
 549                   && dnaToProteinMapping.getTo() != null)
 550           {
 551             if (mappingUsed)
 552             {
 553               /*
 554                * two or more Uniprot xrefs for the same CDS -
 555                * each needs a distinct Mapping (as to a different sequence)
 556                */
 557               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 558             }
 559             mappingUsed = true;
 560
 561             /*
 562              * try to locate the protein mapped to (possibly by a
 563              * previous CDS feature); if not found, construct it from
 564              * the EMBL translation
 565              */
 566             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 567             if (proteinSeq == null)
 568             {
 569               proteinSeq = new Sequence(proteinSeqName,
 570                       product.getSequenceAsString());
 571               matcher.add(proteinSeq);
 572               proteinSeq.setDescription(product.getDescription());
 573               peptides.add(proteinSeq);
 574             }
 575             dnaToProteinMapping.setTo(proteinSeq);
 576             dnaToProteinMapping.setMappedFromId(proteinId);
 577             proteinSeq.addDBRef(proteinDbRef);
 578             dbref.setMap(dnaToProteinMapping);
 579           }
 580           hasUniprotDbref = true;
 581         }
 582         if (product != null)
 583         {
 584           /*
 585            * copy feature dbref to our protein product
 586            */
 587           DBRefEntry pref = proteinDbRef;
 588           pref.setMap(null); // reference is direct
 589           product.addDBRef(pref);
 590           // Add converse mapping reference
 591           if (dnaToProteinMapping != null)
 592           {
 593             Mapping pmap = new Mapping(dna,
 594                     dnaToProteinMapping.getMap().getInverse());
 595             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 596             pref.setMap(pmap);
 597             if (dnaToProteinMapping.getTo() != null)
 598             {
 599               dnaToProteinMapping.getTo().addDBRef(pref);
 600             }
 601           }
 602         }
 603         dna.addDBRef(dbref);
 604       }
 605     }
 606
 607     /*
 608      * if we have a product (translation) but no explicit Uniprot dbref
 609      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 610      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 611      */
 612     if (!hasUniprotDbref && product != null)
 613     {
 614       if (proteinToEmblProteinRef == null)
 615       {
 616         // assuming CDSPROTEIN sequence version = dna version (?!)
 617         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 618                 sequenceVersion, proteinId);
 619       }
 620       product.addDBRef(proteinToEmblProteinRef);
 621
 622       if (dnaToProteinMapping != null
 623               && dnaToProteinMapping.getTo() != null)
 624       {
 625         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 626                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 627         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 628         dnaToProteinMapping.setMappedFromId(proteinId);
 629         dna.addDBRef(dnaToEmblProteinRef);
 630       }
 631     }
 632   }
 633
 634   @Override
 635   public boolean isDnaCoding()
 636   {
 637     return true;
 638   }
 639
 640   /**
 641    * Returns the CDS positions as a single array of [start, end, start, end...]
 642    * positions. If on the reverse strand, these will be in descending order.
 643    *
 644    * @param accession
 645    * @param feature
 646    * @return
 647    */
 648   protected int[] getCdsRanges(String accession, Feature feature)
 649   {
 650     String location = feature.getLocation();
 651     if (location == null)
 652     {
 653       return new int[] {};
 654     }
 655
 656     try
 657     {
 658       List<int[]> ranges = DnaUtils.parseLocation(location);
 659       return listToArray(ranges);
 660     } catch (ParseException e)
 661     {
 662       Console.warn(
 663               String.format("Not parsing inexact CDS location %s in ENA %s",
 664                       location, accession));
 665       return new int[] {};
 666     }
 667   }
 668
 669   /**
 670    * Converts a list of [start, end] ranges to a single array of [start, end,
 671    * start, end ...]
 672    *
 673    * @param ranges
 674    * @return
 675    */
 676   int[] listToArray(List<int[]> ranges)
 677   {
 678     int[] result = new int[ranges.size() * 2];
 679     int i = 0;
 680     for (int[] range : ranges)
 681     {
 682       result[i++] = range[0];
 683       result[i++] = range[1];
 684     }
 685     return result;
 686   }
 687
 688   /**
 689    * Helper method to construct a SequenceFeature for one cds range
 690    *
 691    * @param type
 692    *          feature type ("CDS")
 693    * @param desc
 694    *          description
 695    * @param begin
 696    *          start position
 697    * @param end
 698    *          end position
 699    * @param group
 700    *          feature group
 701    * @param vals
 702    *          map of 'miscellaneous values' for feature
 703    * @return
 704    */
 705   protected SequenceFeature makeCdsFeature(String type, String desc,
 706           int begin, int end, String group, Map<String, String> vals)
 707   {
 708     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 709     if (!vals.isEmpty())
 710     {
 711       for (Entry<String, String> val : vals.entrySet())
 712       {
 713         sf.setValue(val.getKey(), val.getValue());
 714       }
 715     }
 716     return sf;
 717   }
 718
 719   @Override
 720   public String getAccessionSeparator()
 721   {
 722     return null;
 723   }
 724
 725   @Override
 726   public Regex getAccessionValidator()
 727   {
 728     return ACCESSION_REGEX;
 729   }
 730
 731   @Override
 732   public String getDbVersion()
 733   {
 734     return "0";
 735   }
 736
 737   @Override
 738   public int getTier()
 739   {
 740     return 0;
 741   }
 742
 743   @Override
 744   public boolean isValidReference(String accession)
 745   {
 746     if (accession == null || accession.length() < 2)
 747     {
 748       return false;
 749     }
 750     return getAccessionValidator().search(accession);
 751   }
 752
 753   /**
 754    * Truncates (if necessary) the exon intervals to match 3 times the length of
 755    * the protein (including truncation for stop codon included in exon)
 756    *
 757    * @param proteinLength
 758    * @param exon
 759    *          an array of [start, end, start, end...] intervals
 760    * @return the same array (if unchanged) or a truncated copy
 761    */
 762   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 763   {
 764     if (proteinLength <= 0 || exon == null)
 765     {
 766       return exon;
 767     }
 768     int expectedCdsLength = proteinLength * 3;
 769     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 770
 771     /*
 772      * if exon length matches protein, or is shorter, then leave it unchanged
 773      */
 774     if (expectedCdsLength >= exonLength)
 775     {
 776       return exon;
 777     }
 778
 779     int origxon[];
 780     int sxpos = -1;
 781     int endxon = 0;
 782     origxon = new int[exon.length];
 783     System.arraycopy(exon, 0, origxon, 0, exon.length);
 784     int cdspos = 0;
 785     for (int x = 0; x < exon.length; x += 2)
 786     {
 787       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 788       if (expectedCdsLength <= cdspos)
 789       {
 790         // advanced beyond last codon.
 791         sxpos = x;
 792         if (expectedCdsLength != cdspos)
 793         {
 794           // System.err
 795           // .println("Truncating final exon interval on region by "
 796           // + (cdspos - cdslength));
 797         }
 798
 799         /*
 800          * shrink the final exon - reduce end position if forward
 801          * strand, increase it if reverse
 802          */
 803         if (exon[x + 1] >= exon[x])
 804         {
 805           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 806         }
 807         else
 808         {
 809           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 810         }
 811         break;
 812       }
 813     }
 814
 815     if (sxpos != -1)
 816     {
 817       // and trim the exon interval set if necessary
 818       int[] nxon = new int[sxpos + 2];
 819       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 820       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 821                                 // set
 822       exon = nxon;
 823     }
 824     return exon;
 825   }
 826
 827 }