src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.io.File;
  24 import java.io.FileInputStream;
  25 import java.io.InputStream;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.Hashtable;
  30 import java.util.List;
  31 import java.util.Locale;
  32 import java.util.Map;
  33 import java.util.Map.Entry;
  34
  35 import javax.xml.bind.JAXBContext;
  36 import javax.xml.bind.JAXBElement;
  37 import javax.xml.bind.JAXBException;
  38 import javax.xml.stream.FactoryConfigurationError;
  39 import javax.xml.stream.XMLInputFactory;
  40 import javax.xml.stream.XMLStreamException;
  41 import javax.xml.stream.XMLStreamReader;
  42
  43 import com.stevesoft.pat.Regex;
  44
  45 import jalview.analysis.SequenceIdMatcher;
  46 import jalview.bin.Console;
  47 import jalview.datamodel.Alignment;
  48 import jalview.datamodel.AlignmentI;
  49 import jalview.datamodel.DBRefEntry;
  50 import jalview.datamodel.DBRefSource;
  51 import jalview.datamodel.FeatureProperties;
  52 import jalview.datamodel.Mapping;
  53 import jalview.datamodel.Sequence;
  54 import jalview.datamodel.SequenceFeature;
  55 import jalview.datamodel.SequenceI;
  56 import jalview.util.DBRefUtils;
  57 import jalview.util.DnaUtils;
  58 import jalview.util.MapList;
  59 import jalview.util.MappingUtils;
  60 import jalview.ws.ebi.EBIFetchClient;
  61 import jalview.xml.binding.embl.EntryType;
  62 import jalview.xml.binding.embl.EntryType.Feature;
  63 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  64 import jalview.xml.binding.embl.ROOT;
  65 import jalview.xml.binding.embl.XrefType;
  66
  67 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  68 {
  69   private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
  70
  71   /*
  72    * JAL-1856 Embl returns this text for query not found
  73    */
  74   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  75
  76   public EmblXmlSource()
  77   {
  78     super();
  79   }
  80
  81   /**
  82    * Retrieves and parses an emblxml file, and returns an alignment containing
  83    * the parsed sequences, or null if none were found
  84    *
  85    * @param emprefx
  86    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  87    * @param query
  88    * @return
  89    * @throws Exception
  90    */
  91   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  92           throws Exception
  93   {
  94     startQuery();
  95     EBIFetchClient dbFetch = new EBIFetchClient();
  96     File reply;
  97     try
  98     {
  99       reply = dbFetch.fetchDataAsFile(
 100               emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(),
 101               "display=xml", "xml");
 102     } catch (Exception e)
 103     {
 104       stopQuery();
 105       throw new Exception(
 106               String.format("EBI EMBL XML retrieval failed for %s:%s",
 107                       emprefx.toLowerCase(Locale.ROOT), query.trim()),
 108               e);
 109     }
 110     return getEmblSequenceRecords(emprefx, query, reply);
 111   }
 112
 113   /**
 114    * parse an emblxml file stored locally
 115    *
 116    * @param emprefx
 117    *          either EMBL or EMBLCDS strings are allowed - anything else will
 118    *          not retrieve emblxml
 119    * @param query
 120    * @param file
 121    *          the EMBL XML file containing the results of a query
 122    * @return
 123    * @throws Exception
 124    */
 125   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 126           File reply) throws Exception
 127   {
 128     List<EntryType> entries = null;
 129     if (reply != null && reply.exists())
 130     {
 131       file = reply.getAbsolutePath();
 132       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 133       {
 134         InputStream is = new FileInputStream(reply);
 135         entries = getEmblEntries(is);
 136       }
 137     }
 138
 139     /*
 140      * invalid accession gets a reply with no <entry> elements, text content of
 141      * EmbFile reads something like (e.g.) this ungrammatical phrase
 142      * Entry: <acc> display type is either not supported or entry is not found.
 143      */
 144     AlignmentI al = null;
 145     List<SequenceI> seqs = new ArrayList<>();
 146     List<SequenceI> peptides = new ArrayList<>();
 147     if (entries != null)
 148     {
 149       for (EntryType entry : entries)
 150       {
 151         SequenceI seq = getSequence(emprefx, entry, peptides);
 152         if (seq != null)
 153         {
 154           seqs.add(seq.deriveSequence());
 155           // place DBReferences on dataset and refer
 156         }
 157       }
 158       if (!seqs.isEmpty())
 159       {
 160         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 161       }
 162       else
 163       {
 164         jalview.bin.Console.outPrintln(
 165                 "No record found for '" + emprefx + ":" + query + "'");
 166       }
 167     }
 168
 169     stopQuery();
 170     return al;
 171   }
 172
 173   /**
 174    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 175    * (possibly empty) list of <code>EntryType</code> objects.
 176    *
 177    * is
 178    *
 179    * @return
 180    */
 181   List<EntryType> getEmblEntries(InputStream is)
 182   {
 183     List<EntryType> entries = new ArrayList<>();
 184     try
 185     {
 186       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 187       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 188               .createXMLStreamReader(is);
 189       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 190       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 191               ROOT.class);
 192       ROOT root = rootElement.getValue();
 193
 194       /*
 195        * document root contains either "entry" or "entrySet"
 196        */
 197       if (root == null)
 198       {
 199         return entries;
 200       }
 201       if (root.getEntrySet() != null)
 202       {
 203         entries = root.getEntrySet().getEntry();
 204       }
 205       else if (root.getEntry() != null)
 206       {
 207         entries.add(root.getEntry());
 208       }
 209     } catch (JAXBException | XMLStreamException
 210             | FactoryConfigurationError e)
 211     {
 212       e.printStackTrace();
 213     }
 214     return entries;
 215   }
 216
 217   /**
 218    * A helper method to parse XML data and construct a sequence, with any
 219    * available database references and features
 220    *
 221    * @param emprefx
 222    * @param entry
 223    * @param peptides
 224    * @return
 225    */
 226   SequenceI getSequence(String sourceDb, EntryType entry,
 227           List<SequenceI> peptides)
 228   {
 229     String seqString = entry.getSequence();
 230     if (seqString == null)
 231     {
 232       return null;
 233     }
 234     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 235             "");
 236     String accession = entry.getAccession();
 237     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 238
 239     dna.setDescription(entry.getDescription());
 240     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 241     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 242             accession);
 243     dna.addDBRef(selfRref);
 244     selfRref.setMap(
 245             new Mapping(null, new int[]
 246             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 247                     1));
 248
 249     /*
 250      * add db references
 251      */
 252     List<XrefType> xrefs = entry.getXref();
 253     if (xrefs != null)
 254     {
 255       for (XrefType xref : xrefs)
 256       {
 257         String acc = xref.getId();
 258         String source = DBRefUtils.getCanonicalName(xref.getDb());
 259         String version = xref.getSecondaryId();
 260         if (version == null || "".equals(version))
 261         {
 262           version = "0";
 263         }
 264         dna.addDBRef(new DBRefEntry(source, version, acc));
 265       }
 266     }
 267
 268     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 269     try
 270     {
 271       List<Feature> features = entry.getFeature();
 272       if (features != null)
 273       {
 274         for (Feature feature : features)
 275         {
 276           if (FeatureProperties.isCodingFeature(sourceDb,
 277                   feature.getName()))
 278           {
 279             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 280                     matcher);
 281           }
 282         }
 283       }
 284     } catch (Exception e)
 285     {
 286       jalview.bin.Console.errPrintln("EMBL Record Features parsing error!");
 287       System.err
 288               .println("Please report the following to help@jalview.org :");
 289       jalview.bin.Console.errPrintln("EMBL Record " + accession);
 290       jalview.bin.Console.errPrintln("Resulted in exception: " + e.getMessage());
 291       e.printStackTrace(System.err);
 292     }
 293
 294     return dna;
 295   }
 296
 297   /**
 298    * Extracts coding region and product from a CDS feature and decorates it with
 299    * annotations
 300    *
 301    * @param entry
 302    * @param feature
 303    * @param sourceDb
 304    * @param dna
 305    * @param peptides
 306    * @param matcher
 307    */
 308   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 309           SequenceI dna, List<SequenceI> peptides,
 310           SequenceIdMatcher matcher)
 311   {
 312     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 313     final String accession = entry.getAccession();
 314     final String sequenceVersion = entry.getVersion().toString();
 315
 316     int[] exons = getCdsRanges(entry.getAccession(), feature);
 317
 318     String translation = null;
 319     String proteinName = "";
 320     String proteinId = null;
 321     Map<String, String> vals = new Hashtable<>();
 322
 323     /*
 324      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 325      * (phase is required for CDS features in GFF3 format)
 326      */
 327     int codonStart = 1;
 328
 329     /*
 330      * parse qualifiers, saving protein translation, protein id,
 331      * codon start position, product (name), and 'other values'
 332      */
 333     if (feature.getQualifier() != null)
 334     {
 335       for (Qualifier q : feature.getQualifier())
 336       {
 337         String qname = q.getName();
 338         String value = q.getValue();
 339         value = value == null ? ""
 340                 : value.trim().replace(" ", "").replace("\n", "")
 341                         .replace("\t", "");
 342         if (qname.equals("translation"))
 343         {
 344           translation = value;
 345         }
 346         else if (qname.equals("protein_id"))
 347         {
 348           proteinId = value;
 349         }
 350         else if (qname.equals("codon_start"))
 351         {
 352           try
 353           {
 354             codonStart = Integer.parseInt(value.trim());
 355           } catch (NumberFormatException e)
 356           {
 357             jalview.bin.Console.errPrintln("Invalid codon_start in XML for "
 358                     + entry.getAccession() + ": " + e.getMessage());
 359           }
 360         }
 361         else if (qname.equals("product"))
 362         {
 363           // sometimes name is returned e.g. for V00488
 364           proteinName = value;
 365         }
 366         else
 367         {
 368           // throw anything else into the additional properties hash
 369           if (!"".equals(value))
 370           {
 371             vals.put(qname, value);
 372           }
 373         }
 374       }
 375     }
 376
 377     DBRefEntry proteinToEmblProteinRef = null;
 378     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 379
 380     SequenceI product = null;
 381     Mapping dnaToProteinMapping = null;
 382     if (translation != null && proteinName != null && proteinId != null)
 383     {
 384       int translationLength = translation.length();
 385
 386       /*
 387        * look for product in peptides list, if not found, add it
 388        */
 389       product = matcher.findIdMatch(proteinId);
 390       if (product == null)
 391       {
 392         product = new Sequence(proteinId, translation, 1,
 393                 translationLength);
 394         product.setDescription(((proteinName.length() == 0)
 395                 ? "Protein Product from " + sourceDb
 396                 : proteinName));
 397         peptides.add(product);
 398         matcher.add(product);
 399       }
 400
 401       // we have everything - create the mapping and perhaps the protein
 402       // sequence
 403       if (exons == null || exons.length == 0)
 404       {
 405         /*
 406          * workaround until we handle dna location for CDS sequence
 407          * e.g. location="X53828.1:60..1058" correctly
 408          */
 409         jalview.bin.Console.errPrintln(
 410                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 411                         + sourceDb + ":" + entry.getAccession() + ")");
 412         int dnaLength = dna.getLength();
 413         if (translationLength * 3 == (1 - codonStart + dnaLength))
 414         {
 415           jalview.bin.Console.errPrintln(
 416                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 417           // this might occur for CDS sequences where no features are marked
 418           exons = new int[] { dna.getStart() + (codonStart - 1),
 419               dna.getEnd() };
 420           dnaToProteinMapping = new Mapping(product, exons,
 421                   new int[]
 422                   { 1, translationLength }, 3, 1);
 423         }
 424         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 425         {
 426           jalview.bin.Console.errPrintln(
 427                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 428           exons = new int[] { dna.getStart() + (codonStart - 1),
 429               dna.getEnd() - 3 };
 430           dnaToProteinMapping = new Mapping(product, exons,
 431                   new int[]
 432                   { 1, translationLength }, 3, 1);
 433         }
 434       }
 435       else
 436       {
 437         // Trim the exon mapping if necessary - the given product may only be a
 438         // fragment of a larger protein. (EMBL:AY043181 is an example)
 439
 440         if (isEmblCdna)
 441         {
 442           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 443           // map
 444           // if given a dataset reference, search dataset for parent EMBL
 445           // sequence if it exists and set its map
 446           // make a new feature annotating the coding contig
 447         }
 448         else
 449         {
 450           // final product length truncation check
 451           int[] exons2 = adjustForProteinLength(translationLength, exons);
 452           dnaToProteinMapping = new Mapping(product, exons2,
 453                   new int[]
 454                   { 1, translationLength }, 3, 1);
 455           if (product != null)
 456           {
 457             /*
 458              * make xref with mapping from protein to EMBL dna
 459              */
 460             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 461                     sequenceVersion, proteinId,
 462                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 463             product.addDBRef(proteinToEmblRef);
 464
 465             /*
 466              * make xref from protein to EMBLCDS; we assume here that the
 467              * CDS sequence version is same as dna sequence (?!)
 468              */
 469             MapList proteinToCdsMapList = new MapList(
 470                     new int[]
 471                     { 1, translationLength },
 472                     new int[]
 473                     { 1 + (codonStart - 1),
 474                         (codonStart - 1) + 3 * translationLength },
 475                     1, 3);
 476             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 477                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 478                     new Mapping(proteinToCdsMapList));
 479             product.addDBRef(proteinToEmblCdsRef);
 480
 481             /*
 482              * make 'direct' xref from protein to EMBLCDSPROTEIN
 483              */
 484             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 485             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 486             proteinToEmblProteinRef.setMap(null);
 487             product.addDBRef(proteinToEmblProteinRef);
 488           }
 489         }
 490       }
 491
 492       /*
 493        * add cds features to dna sequence
 494        */
 495       String cds = feature.getName(); // "CDS"
 496       for (int xint = 0; exons != null
 497               && xint < exons.length - 1; xint += 2)
 498       {
 499         int exonStart = exons[xint];
 500         int exonEnd = exons[xint + 1];
 501         int begin = Math.min(exonStart, exonEnd);
 502         int end = Math.max(exonStart, exonEnd);
 503         int exonNumber = xint / 2 + 1;
 504         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 505                 exonNumber, proteinName, proteinId);
 506
 507         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 508                 vals);
 509
 510         sf.setEnaLocation(feature.getLocation());
 511         boolean forwardStrand = exonStart <= exonEnd;
 512         sf.setStrand(forwardStrand ? "+" : "-");
 513         sf.setPhase(String.valueOf(codonStart - 1));
 514         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 515         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 516
 517         dna.addSequenceFeature(sf);
 518       }
 519     }
 520
 521     /*
 522      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 523      */
 524     boolean hasUniprotDbref = false;
 525     List<XrefType> xrefs = feature.getXref();
 526     if (xrefs != null)
 527     {
 528       boolean mappingUsed = false;
 529       for (XrefType xref : xrefs)
 530       {
 531         /*
 532          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 533          */
 534         String source = DBRefUtils.getCanonicalName(xref.getDb());
 535         String version = xref.getSecondaryId();
 536         if (version == null || "".equals(version))
 537         {
 538           version = "0";
 539         }
 540         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 541         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 542                 dbref.getAccessionId());
 543         if (source.equals(DBRefSource.UNIPROT))
 544         {
 545           String proteinSeqName = DBRefSource.UNIPROT + "|"
 546                   + dbref.getAccessionId();
 547           if (dnaToProteinMapping != null
 548                   && dnaToProteinMapping.getTo() != null)
 549           {
 550             if (mappingUsed)
 551             {
 552               /*
 553                * two or more Uniprot xrefs for the same CDS -
 554                * each needs a distinct Mapping (as to a different sequence)
 555                */
 556               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 557             }
 558             mappingUsed = true;
 559
 560             /*
 561              * try to locate the protein mapped to (possibly by a
 562              * previous CDS feature); if not found, construct it from
 563              * the EMBL translation
 564              */
 565             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 566             if (proteinSeq == null)
 567             {
 568               proteinSeq = new Sequence(proteinSeqName,
 569                       product.getSequenceAsString());
 570               matcher.add(proteinSeq);
 571               proteinSeq.setDescription(product.getDescription());
 572               peptides.add(proteinSeq);
 573             }
 574             dnaToProteinMapping.setTo(proteinSeq);
 575             dnaToProteinMapping.setMappedFromId(proteinId);
 576             proteinSeq.addDBRef(proteinDbRef);
 577             dbref.setMap(dnaToProteinMapping);
 578           }
 579           hasUniprotDbref = true;
 580         }
 581         if (product != null)
 582         {
 583           /*
 584            * copy feature dbref to our protein product
 585            */
 586           DBRefEntry pref = proteinDbRef;
 587           pref.setMap(null); // reference is direct
 588           product.addDBRef(pref);
 589           // Add converse mapping reference
 590           if (dnaToProteinMapping != null)
 591           {
 592             Mapping pmap = new Mapping(dna,
 593                     dnaToProteinMapping.getMap().getInverse());
 594             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 595             pref.setMap(pmap);
 596             if (dnaToProteinMapping.getTo() != null)
 597             {
 598               dnaToProteinMapping.getTo().addDBRef(pref);
 599             }
 600           }
 601         }
 602         dna.addDBRef(dbref);
 603       }
 604     }
 605
 606     /*
 607      * if we have a product (translation) but no explicit Uniprot dbref
 608      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 609      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 610      */
 611     if (!hasUniprotDbref && product != null)
 612     {
 613       if (proteinToEmblProteinRef == null)
 614       {
 615         // assuming CDSPROTEIN sequence version = dna version (?!)
 616         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 617                 sequenceVersion, proteinId);
 618       }
 619       product.addDBRef(proteinToEmblProteinRef);
 620
 621       if (dnaToProteinMapping != null
 622               && dnaToProteinMapping.getTo() != null)
 623       {
 624         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 625                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 626         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 627         dnaToProteinMapping.setMappedFromId(proteinId);
 628         dna.addDBRef(dnaToEmblProteinRef);
 629       }
 630     }
 631   }
 632
 633   @Override
 634   public boolean isDnaCoding()
 635   {
 636     return true;
 637   }
 638
 639   /**
 640    * Returns the CDS positions as a single array of [start, end, start, end...]
 641    * positions. If on the reverse strand, these will be in descending order.
 642    *
 643    * @param accession
 644    * @param feature
 645    * @return
 646    */
 647   protected int[] getCdsRanges(String accession, Feature feature)
 648   {
 649     String location = feature.getLocation();
 650     if (location == null)
 651     {
 652       return new int[] {};
 653     }
 654
 655     try
 656     {
 657       List<int[]> ranges = DnaUtils.parseLocation(location);
 658       return listToArray(ranges);
 659     } catch (ParseException e)
 660     {
 661       Console.warn(
 662               String.format("Not parsing inexact CDS location %s in ENA %s",
 663                       location, accession));
 664       return new int[] {};
 665     }
 666   }
 667
 668   /**
 669    * Converts a list of [start, end] ranges to a single array of [start, end,
 670    * start, end ...]
 671    *
 672    * @param ranges
 673    * @return
 674    */
 675   int[] listToArray(List<int[]> ranges)
 676   {
 677     int[] result = new int[ranges.size() * 2];
 678     int i = 0;
 679     for (int[] range : ranges)
 680     {
 681       result[i++] = range[0];
 682       result[i++] = range[1];
 683     }
 684     return result;
 685   }
 686
 687   /**
 688    * Helper method to construct a SequenceFeature for one cds range
 689    *
 690    * @param type
 691    *          feature type ("CDS")
 692    * @param desc
 693    *          description
 694    * @param begin
 695    *          start position
 696    * @param end
 697    *          end position
 698    * @param group
 699    *          feature group
 700    * @param vals
 701    *          map of 'miscellaneous values' for feature
 702    * @return
 703    */
 704   protected SequenceFeature makeCdsFeature(String type, String desc,
 705           int begin, int end, String group, Map<String, String> vals)
 706   {
 707     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 708     if (!vals.isEmpty())
 709     {
 710       for (Entry<String, String> val : vals.entrySet())
 711       {
 712         sf.setValue(val.getKey(), val.getValue());
 713       }
 714     }
 715     return sf;
 716   }
 717
 718   @Override
 719   public String getAccessionSeparator()
 720   {
 721     return null;
 722   }
 723
 724   @Override
 725   public Regex getAccessionValidator()
 726   {
 727     return ACCESSION_REGEX;
 728   }
 729
 730   @Override
 731   public String getDbVersion()
 732   {
 733     return "0";
 734   }
 735
 736   @Override
 737   public int getTier()
 738   {
 739     return 0;
 740   }
 741
 742   @Override
 743   public boolean isValidReference(String accession)
 744   {
 745     if (accession == null || accession.length() < 2)
 746     {
 747       return false;
 748     }
 749     return getAccessionValidator().search(accession);
 750   }
 751
 752   /**
 753    * Truncates (if necessary) the exon intervals to match 3 times the length of
 754    * the protein (including truncation for stop codon included in exon)
 755    *
 756    * @param proteinLength
 757    * @param exon
 758    *          an array of [start, end, start, end...] intervals
 759    * @return the same array (if unchanged) or a truncated copy
 760    */
 761   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 762   {
 763     if (proteinLength <= 0 || exon == null)
 764     {
 765       return exon;
 766     }
 767     int expectedCdsLength = proteinLength * 3;
 768     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 769
 770     /*
 771      * if exon length matches protein, or is shorter, then leave it unchanged
 772      */
 773     if (expectedCdsLength >= exonLength)
 774     {
 775       return exon;
 776     }
 777
 778     int origxon[];
 779     int sxpos = -1;
 780     int endxon = 0;
 781     origxon = new int[exon.length];
 782     System.arraycopy(exon, 0, origxon, 0, exon.length);
 783     int cdspos = 0;
 784     for (int x = 0; x < exon.length; x += 2)
 785     {
 786       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 787       if (expectedCdsLength <= cdspos)
 788       {
 789         // advanced beyond last codon.
 790         sxpos = x;
 791         if (expectedCdsLength != cdspos)
 792         {
 793           // System.err
 794           // .println("Truncating final exon interval on region by "
 795           // + (cdspos - cdslength));
 796         }
 797
 798         /*
 799          * shrink the final exon - reduce end position if forward
 800          * strand, increase it if reverse
 801          */
 802         if (exon[x + 1] >= exon[x])
 803         {
 804           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 805         }
 806         else
 807         {
 808           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 809         }
 810         break;
 811       }
 812     }
 813
 814     if (sxpos != -1)
 815     {
 816       // and trim the exon interval set if necessary
 817       int[] nxon = new int[sxpos + 2];
 818       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 819       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 820                                 // set
 821       exon = nxon;
 822     }
 823     return exon;
 824   }
 825
 826 }