src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.io.File;
  24 import java.io.FileInputStream;
  25 import java.io.InputStream;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.Hashtable;
  30 import java.util.List;
  31 import java.util.Map;
  32 import java.util.Map.Entry;
  33
  34 import javax.xml.bind.JAXBContext;
  35 import javax.xml.bind.JAXBElement;
  36 import javax.xml.bind.JAXBException;
  37 import javax.xml.stream.FactoryConfigurationError;
  38 import javax.xml.stream.XMLInputFactory;
  39 import javax.xml.stream.XMLStreamException;
  40 import javax.xml.stream.XMLStreamReader;
  41
  42 import com.stevesoft.pat.Regex;
  43
  44 import jalview.analysis.SequenceIdMatcher;
  45 import jalview.bin.Cache;
  46 import jalview.datamodel.Alignment;
  47 import jalview.datamodel.AlignmentI;
  48 import jalview.datamodel.DBRefEntry;
  49 import jalview.datamodel.DBRefSource;
  50 import jalview.datamodel.FeatureProperties;
  51 import jalview.datamodel.Mapping;
  52 import jalview.datamodel.Sequence;
  53 import jalview.datamodel.SequenceFeature;
  54 import jalview.datamodel.SequenceI;
  55 import jalview.util.DBRefUtils;
  56 import jalview.util.DnaUtils;
  57 import jalview.util.MapList;
  58 import jalview.util.MappingUtils;
  59 import jalview.ws.ebi.EBIFetchClient;
  60 import jalview.xml.binding.embl.EntryType;
  61 import jalview.xml.binding.embl.EntryType.Feature;
  62 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  63 import jalview.xml.binding.embl.ROOT;
  64 import jalview.xml.binding.embl.XrefType;
  65
  66 /**
  67  * Provides XML binding and parsing of EMBL or EMBLCDS records retrieved from
  68  * (e.g.) {@code https://www.ebi.ac.uk/ena/data/view/x53828&display=xml}.
  69  *
  70  * @deprecated endpoint withdrawn August 2020 (JAL-3692), use EmblFlatfileSource
  71  */
  72 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  73 {
  74   private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
  75
  76   /*
  77    * JAL-1856 Embl returns this text for query not found
  78    */
  79   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  80
  81   public EmblXmlSource()
  82   {
  83     super();
  84   }
  85
  86   /**
  87    * Retrieves and parses an emblxml file, and returns an alignment containing
  88    * the parsed sequences, or null if none were found
  89    *
  90    * @param emprefx
  91    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  92    * @param query
  93    * @return
  94    * @throws Exception
  95    */
  96   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  97           throws Exception
  98   {
  99     startQuery();
 100     EBIFetchClient dbFetch = new EBIFetchClient();
 101     File reply;
 102     try
 103     {
 104       reply = dbFetch.fetchDataAsFile(
 105               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
 106               "xml");
 107     } catch (Exception e)
 108     {
 109       stopQuery();
 110       throw new Exception(
 111               String.format("EBI EMBL XML retrieval failed for %s:%s",
 112                       emprefx.toLowerCase(), query.trim()),
 113               e);
 114     }
 115     return getEmblSequenceRecords(emprefx, query, reply);
 116   }
 117
 118   /**
 119    * parse an emblxml file stored locally
 120    *
 121    * @param emprefx
 122    *          either EMBL or EMBLCDS strings are allowed - anything else will
 123    *          not retrieve emblxml
 124    * @param query
 125    * @param file
 126    *          the EMBL XML file containing the results of a query
 127    * @return
 128    * @throws Exception
 129    */
 130   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 131           File reply) throws Exception
 132   {
 133     List<EntryType> entries = null;
 134     if (reply != null && reply.exists())
 135     {
 136       file = reply.getAbsolutePath();
 137       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 138       {
 139         InputStream is = new FileInputStream(reply);
 140         entries = getEmblEntries(is);
 141       }
 142     }
 143
 144     /*
 145      * invalid accession gets a reply with no <entry> elements, text content of
 146      * EmbFile reads something like (e.g.) this ungrammatical phrase
 147      * Entry: <acc> display type is either not supported or entry is not found.
 148      */
 149     AlignmentI al = null;
 150     List<SequenceI> seqs = new ArrayList<>();
 151     List<SequenceI> peptides = new ArrayList<>();
 152     if (entries != null)
 153     {
 154       for (EntryType entry : entries)
 155       {
 156         SequenceI seq = getSequence(emprefx, entry, peptides);
 157         if (seq != null)
 158         {
 159           seqs.add(seq.deriveSequence());
 160           // place DBReferences on dataset and refer
 161         }
 162       }
 163       if (!seqs.isEmpty())
 164       {
 165         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 166       }
 167       else
 168       {
 169         System.out.println(
 170                 "No record found for '" + emprefx + ":" + query + "'");
 171       }
 172     }
 173
 174     stopQuery();
 175     return al;
 176   }
 177
 178   /**
 179    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 180    * (possibly empty) list of <code>EntryType</code> objects.
 181    *
 182    * is
 183    *
 184    * @return
 185    */
 186   List<EntryType> getEmblEntries(InputStream is)
 187   {
 188     List<EntryType> entries = new ArrayList<>();
 189     try
 190     {
 191       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 192       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 193               .createXMLStreamReader(is);
 194       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 195       JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
 196               ROOT.class);
 197       ROOT root = rootElement.getValue();
 198
 199       /*
 200        * document root contains either "entry" or "entrySet"
 201        */
 202       if (root == null)
 203       {
 204         return entries;
 205       }
 206       if (root.getEntrySet() != null)
 207       {
 208         entries = root.getEntrySet().getEntry();
 209       }
 210       else if (root.getEntry() != null)
 211       {
 212         entries.add(root.getEntry());
 213       }
 214     } catch (JAXBException | XMLStreamException
 215             | FactoryConfigurationError e)
 216     {
 217       e.printStackTrace();
 218     }
 219     return entries;
 220   }
 221
 222   /**
 223    * A helper method to parse XML data and construct a sequence, with any
 224    * available database references and features
 225    *
 226    * @param emprefx
 227    * @param entry
 228    * @param peptides
 229    * @return
 230    */
 231   SequenceI getSequence(String sourceDb, EntryType entry,
 232           List<SequenceI> peptides)
 233   {
 234     String seqString = entry.getSequence();
 235     if (seqString == null)
 236     {
 237       return null;
 238     }
 239     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 240             "");
 241     String accession = entry.getAccession();
 242     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 243
 244     dna.setDescription(entry.getDescription());
 245     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 246     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 247             accession);
 248     dna.addDBRef(selfRref);
 249     selfRref.setMap(
 250             new Mapping(null, new int[]
 251             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 252                     1));
 253
 254     /*
 255      * add db references
 256      */
 257     List<XrefType> xrefs = entry.getXref();
 258     if (xrefs != null)
 259     {
 260       for (XrefType xref : xrefs)
 261       {
 262         String acc = xref.getId();
 263         String source = DBRefUtils.getCanonicalName(xref.getDb());
 264         String version = xref.getSecondaryId();
 265         if (version == null || "".equals(version))
 266         {
 267           version = "0";
 268         }
 269         dna.addDBRef(new DBRefEntry(source, version, acc));
 270       }
 271     }
 272
 273     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 274     try
 275     {
 276       List<Feature> features = entry.getFeature();
 277       if (features != null)
 278       {
 279         for (Feature feature : features)
 280         {
 281           if (FeatureProperties.isCodingFeature(sourceDb,
 282                   feature.getName()))
 283           {
 284             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 285                     matcher);
 286           }
 287         }
 288       }
 289     } catch (Exception e)
 290     {
 291       System.err.println("EMBL Record Features parsing error!");
 292       System.err
 293               .println("Please report the following to help@jalview.org :");
 294       System.err.println("EMBL Record " + accession);
 295       System.err.println("Resulted in exception: " + e.getMessage());
 296       e.printStackTrace(System.err);
 297     }
 298
 299     return dna;
 300   }
 301
 302   /**
 303    * Extracts coding region and product from a CDS feature and decorates it with
 304    * annotations
 305    *
 306    * @param entry
 307    * @param feature
 308    * @param sourceDb
 309    * @param dna
 310    * @param peptides
 311    * @param matcher
 312    */
 313   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 314           SequenceI dna, List<SequenceI> peptides,
 315           SequenceIdMatcher matcher)
 316   {
 317     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 318     final String accession = entry.getAccession();
 319     final String sequenceVersion = entry.getVersion().toString();
 320
 321     int[] exons = getCdsRanges(entry.getAccession(), feature);
 322
 323     String translation = null;
 324     String proteinName = "";
 325     String proteinId = null;
 326     Map<String, String> vals = new Hashtable<>();
 327
 328     /*
 329      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 330      * (phase is required for CDS features in GFF3 format)
 331      */
 332     int codonStart = 1;
 333
 334     /*
 335      * parse qualifiers, saving protein translation, protein id,
 336      * codon start position, product (name), and 'other values'
 337      */
 338     if (feature.getQualifier() != null)
 339     {
 340       for (Qualifier q : feature.getQualifier())
 341       {
 342         String qname = q.getName();
 343         String value = q.getValue();
 344         value = value == null ? ""
 345                 : value.trim().replace(" ", "").replace("\n", "")
 346                         .replace("\t", "");
 347         if (qname.equals("translation"))
 348         {
 349           translation = value;
 350         }
 351         else if (qname.equals("protein_id"))
 352         {
 353           proteinId = value;
 354         }
 355         else if (qname.equals("codon_start"))
 356         {
 357           try
 358           {
 359             codonStart = Integer.parseInt(value.trim());
 360           } catch (NumberFormatException e)
 361           {
 362             System.err.println("Invalid codon_start in XML for "
 363                     + entry.getAccession() + ": " + e.getMessage());
 364           }
 365         }
 366         else if (qname.equals("product"))
 367         {
 368           // sometimes name is returned e.g. for V00488
 369           proteinName = value;
 370         }
 371         else
 372         {
 373           // throw anything else into the additional properties hash
 374           if (!"".equals(value))
 375           {
 376             vals.put(qname, value);
 377           }
 378         }
 379       }
 380     }
 381
 382     DBRefEntry proteinToEmblProteinRef = null;
 383     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 384
 385     SequenceI product = null;
 386     Mapping dnaToProteinMapping = null;
 387     if (translation != null && proteinName != null && proteinId != null)
 388     {
 389       int translationLength = translation.length();
 390
 391       /*
 392        * look for product in peptides list, if not found, add it
 393        */
 394       product = matcher.findIdMatch(proteinId);
 395       if (product == null)
 396       {
 397         product = new Sequence(proteinId, translation, 1,
 398                 translationLength);
 399         product.setDescription(((proteinName.length() == 0)
 400                 ? "Protein Product from " + sourceDb
 401                 : proteinName));
 402         peptides.add(product);
 403         matcher.add(product);
 404       }
 405
 406       // we have everything - create the mapping and perhaps the protein
 407       // sequence
 408       if (exons == null || exons.length == 0)
 409       {
 410         /*
 411          * workaround until we handle dna location for CDS sequence
 412          * e.g. location="X53828.1:60..1058" correctly
 413          */
 414         System.err.println(
 415                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 416                         + sourceDb + ":" + entry.getAccession() + ")");
 417         int dnaLength = dna.getLength();
 418         if (translationLength * 3 == (1 - codonStart + dnaLength))
 419         {
 420           System.err.println(
 421                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 422           // this might occur for CDS sequences where no features are marked
 423           exons = new int[] { dna.getStart() + (codonStart - 1),
 424               dna.getEnd() };
 425           dnaToProteinMapping = new Mapping(product, exons,
 426                   new int[]
 427                   { 1, translationLength }, 3, 1);
 428         }
 429         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 430         {
 431           System.err.println(
 432                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 433           exons = new int[] { dna.getStart() + (codonStart - 1),
 434               dna.getEnd() - 3 };
 435           dnaToProteinMapping = new Mapping(product, exons,
 436                   new int[]
 437                   { 1, translationLength }, 3, 1);
 438         }
 439       }
 440       else
 441       {
 442         // Trim the exon mapping if necessary - the given product may only be a
 443         // fragment of a larger protein. (EMBL:AY043181 is an example)
 444
 445         if (isEmblCdna)
 446         {
 447           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 448           // map
 449           // if given a dataset reference, search dataset for parent EMBL
 450           // sequence if it exists and set its map
 451           // make a new feature annotating the coding contig
 452         }
 453         else
 454         {
 455           // final product length truncation check
 456           int[] cdsRanges = adjustForProteinLength(translationLength,
 457                   exons);
 458           dnaToProteinMapping = new Mapping(product, cdsRanges,
 459                   new int[]
 460                   { 1, translationLength }, 3, 1);
 461           if (product != null)
 462           {
 463             /*
 464              * make xref with mapping from protein to EMBL dna
 465              */
 466             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 467                     sequenceVersion, proteinId,
 468                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 469             product.addDBRef(proteinToEmblRef);
 470
 471             /*
 472              * make xref from protein to EMBLCDS; we assume here that the
 473              * CDS sequence version is same as dna sequence (?!)
 474              */
 475             MapList proteinToCdsMapList = new MapList(
 476                     new int[]
 477                     { 1, translationLength },
 478                     new int[]
 479                     { 1 + (codonStart - 1),
 480                         (codonStart - 1) + 3 * translationLength },
 481                     1, 3);
 482             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 483                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 484                     new Mapping(proteinToCdsMapList));
 485             product.addDBRef(proteinToEmblCdsRef);
 486
 487             /*
 488              * make 'direct' xref from protein to EMBLCDSPROTEIN
 489              */
 490             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 491             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 492             proteinToEmblProteinRef.setMap(null);
 493             product.addDBRef(proteinToEmblProteinRef);
 494           }
 495         }
 496       }
 497
 498       /*
 499        * add cds features to dna sequence
 500        */
 501       String cds = feature.getName(); // "CDS"
 502       for (int xint = 0; exons != null
 503               && xint < exons.length - 1; xint += 2)
 504       {
 505         int exonStart = exons[xint];
 506         int exonEnd = exons[xint + 1];
 507         int begin = Math.min(exonStart, exonEnd);
 508         int end = Math.max(exonStart, exonEnd);
 509         int exonNumber = xint / 2 + 1;
 510         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 511                 exonNumber, proteinName, proteinId);
 512
 513         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 514                 vals);
 515
 516         sf.setEnaLocation(feature.getLocation());
 517         boolean forwardStrand = exonStart <= exonEnd;
 518         sf.setStrand(forwardStrand ? "+" : "-");
 519         sf.setPhase(String.valueOf(codonStart - 1));
 520         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 521         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 522
 523         dna.addSequenceFeature(sf);
 524       }
 525     }
 526
 527     /*
 528      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 529      */
 530     boolean hasUniprotDbref = false;
 531     List<XrefType> xrefs = feature.getXref();
 532     if (xrefs != null)
 533     {
 534       boolean mappingUsed = false;
 535       for (XrefType xref : xrefs)
 536       {
 537         /*
 538          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 539          */
 540         String source = DBRefUtils.getCanonicalName(xref.getDb());
 541         String version = xref.getSecondaryId();
 542         if (version == null || "".equals(version))
 543         {
 544           version = "0";
 545         }
 546         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 547         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 548                 dbref.getAccessionId());
 549         if (source.equals(DBRefSource.UNIPROT))
 550         {
 551           String proteinSeqName = DBRefSource.UNIPROT + "|"
 552                   + dbref.getAccessionId();
 553           if (dnaToProteinMapping != null
 554                   && dnaToProteinMapping.getTo() != null)
 555           {
 556             if (mappingUsed)
 557             {
 558               /*
 559                * two or more Uniprot xrefs for the same CDS -
 560                * each needs a distinct Mapping (as to a different sequence)
 561                */
 562               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 563             }
 564             mappingUsed = true;
 565
 566             /*
 567              * try to locate the protein mapped to (possibly by a
 568              * previous CDS feature); if not found, construct it from
 569              * the EMBL translation
 570              */
 571             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 572             if (proteinSeq == null)
 573             {
 574               proteinSeq = new Sequence(proteinSeqName,
 575                       product.getSequenceAsString());
 576               matcher.add(proteinSeq);
 577               proteinSeq.setDescription(product.getDescription());
 578               peptides.add(proteinSeq);
 579             }
 580             dnaToProteinMapping.setTo(proteinSeq);
 581             dnaToProteinMapping.setMappedFromId(proteinId);
 582             proteinSeq.addDBRef(proteinDbRef);
 583             dbref.setMap(dnaToProteinMapping);
 584           }
 585           hasUniprotDbref = true;
 586         }
 587         if (product != null)
 588         {
 589           /*
 590            * copy feature dbref to our protein product
 591            */
 592           DBRefEntry pref = proteinDbRef;
 593           pref.setMap(null); // reference is direct
 594           product.addDBRef(pref);
 595           // Add converse mapping reference
 596           if (dnaToProteinMapping != null)
 597           {
 598             Mapping pmap = new Mapping(dna,
 599                     dnaToProteinMapping.getMap().getInverse());
 600             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 601             pref.setMap(pmap);
 602             if (dnaToProteinMapping.getTo() != null)
 603             {
 604               dnaToProteinMapping.getTo().addDBRef(pref);
 605             }
 606           }
 607         }
 608         dna.addDBRef(dbref);
 609       }
 610     }
 611
 612     /*
 613      * if we have a product (translation) but no explicit Uniprot dbref
 614      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 615      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 616      */
 617     if (!hasUniprotDbref && product != null)
 618     {
 619       if (proteinToEmblProteinRef == null)
 620       {
 621         // assuming CDSPROTEIN sequence version = dna version (?!)
 622         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 623                 sequenceVersion, proteinId);
 624       }
 625       product.addDBRef(proteinToEmblProteinRef);
 626
 627       if (dnaToProteinMapping != null
 628               && dnaToProteinMapping.getTo() != null)
 629       {
 630         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 631                 DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
 632         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 633         dnaToProteinMapping.setMappedFromId(proteinId);
 634         dna.addDBRef(dnaToEmblProteinRef);
 635       }
 636     }
 637   }
 638
 639   @Override
 640   public boolean isDnaCoding()
 641   {
 642     return true;
 643   }
 644
 645   /**
 646    * Returns the CDS positions as a single array of [start, end, start, end...]
 647    * positions. If on the reverse strand, these will be in descending order.
 648    *
 649    * @param accession
 650    * @param feature
 651    * @return
 652    */
 653   protected int[] getCdsRanges(String accession, Feature feature)
 654   {
 655     String location = feature.getLocation();
 656     if (location == null)
 657     {
 658       return new int[] {};
 659     }
 660
 661     try
 662     {
 663       List<int[]> ranges = DnaUtils.parseLocation(location);
 664       return listToArray(ranges);
 665     } catch (ParseException e)
 666     {
 667       Cache.log.warn(
 668               String.format("Not parsing inexact CDS location %s in ENA %s",
 669                       location, accession));
 670       return new int[] {};
 671     }
 672   }
 673
 674   /**
 675    * Converts a list of [start, end] ranges to a single array of [start, end,
 676    * start, end ...]
 677    *
 678    * @param ranges
 679    * @return
 680    */
 681   int[] listToArray(List<int[]> ranges)
 682   {
 683     int[] result = new int[ranges.size() * 2];
 684     int i = 0;
 685     for (int[] range : ranges)
 686     {
 687       result[i++] = range[0];
 688       result[i++] = range[1];
 689     }
 690     return result;
 691   }
 692
 693   /**
 694    * Helper method to construct a SequenceFeature for one cds range
 695    *
 696    * @param type
 697    *          feature type ("CDS")
 698    * @param desc
 699    *          description
 700    * @param begin
 701    *          start position
 702    * @param end
 703    *          end position
 704    * @param group
 705    *          feature group
 706    * @param vals
 707    *          map of 'miscellaneous values' for feature
 708    * @return
 709    */
 710   protected SequenceFeature makeCdsFeature(String type, String desc,
 711           int begin, int end, String group, Map<String, String> vals)
 712   {
 713     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 714     if (!vals.isEmpty())
 715     {
 716       for (Entry<String, String> val : vals.entrySet())
 717       {
 718         sf.setValue(val.getKey(), val.getValue());
 719       }
 720     }
 721     return sf;
 722   }
 723
 724   @Override
 725   public String getAccessionSeparator()
 726   {
 727     return null;
 728   }
 729
 730   @Override
 731   public Regex getAccessionValidator()
 732   {
 733     return ACCESSION_REGEX;
 734   }
 735
 736   @Override
 737   public String getDbVersion()
 738   {
 739     return "0";
 740   }
 741
 742   @Override
 743   public int getTier()
 744   {
 745     return 0;
 746   }
 747
 748   @Override
 749   public boolean isValidReference(String accession)
 750   {
 751     if (accession == null || accession.length() < 2)
 752     {
 753       return false;
 754     }
 755     return getAccessionValidator().search(accession);
 756   }
 757
 758   /**
 759    * Truncates (if necessary) the exon intervals to match 3 times the length of
 760    * the protein; also accepts 3 bases longer (for stop codon not included in
 761    * protein)
 762    *
 763    * @param proteinLength
 764    * @param exon
 765    *          an array of [start, end, start, end...] intervals
 766    * @return the same array (if unchanged) or a truncated copy
 767    */
 768   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 769   {
 770     if (proteinLength <= 0 || exon == null)
 771     {
 772       return exon;
 773     }
 774     int expectedCdsLength = proteinLength * 3;
 775     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 776
 777     /*
 778      * if exon length matches protein, or is shorter, or longer by the
 779      * length of a stop codon (3 bases), then leave it unchanged
 780      */
 781     if (expectedCdsLength >= exonLength
 782             || expectedCdsLength == exonLength - 3)
 783     {
 784       return exon;
 785     }
 786
 787     int origxon[];
 788     int sxpos = -1;
 789     int endxon = 0;
 790     origxon = new int[exon.length];
 791     System.arraycopy(exon, 0, origxon, 0, exon.length);
 792     int cdspos = 0;
 793     for (int x = 0; x < exon.length; x += 2)
 794     {
 795       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 796       if (expectedCdsLength <= cdspos)
 797       {
 798         // advanced beyond last codon.
 799         sxpos = x;
 800         if (expectedCdsLength != cdspos)
 801         {
 802           // System.err
 803           // .println("Truncating final exon interval on region by "
 804           // + (cdspos - cdslength));
 805         }
 806
 807         /*
 808          * shrink the final exon - reduce end position if forward
 809          * strand, increase it if reverse
 810          */
 811         if (exon[x + 1] >= exon[x])
 812         {
 813           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 814         }
 815         else
 816         {
 817           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 818         }
 819         break;
 820       }
 821     }
 822
 823     if (sxpos != -1)
 824     {
 825       // and trim the exon interval set if necessary
 826       int[] nxon = new int[sxpos + 2];
 827       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 828       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 829                                 // set
 830       exon = nxon;
 831     }
 832     return exon;
 833   }
 834
 835 }