src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.bin.Cache;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.DBRefEntry;
  28 import jalview.datamodel.DBRefSource;
  29 import jalview.datamodel.FeatureProperties;
  30 import jalview.datamodel.Mapping;
  31 import jalview.datamodel.Sequence;
  32 import jalview.datamodel.SequenceFeature;
  33 import jalview.datamodel.SequenceI;
  34 import jalview.util.DBRefUtils;
  35 import jalview.util.DnaUtils;
  36 import jalview.util.MapList;
  37 import jalview.util.MappingUtils;
  38 import jalview.util.MessageManager;
  39 import jalview.ws.ebi.EBIFetchClient;
  40 import jalview.xml.binding.embl.EntryType;
  41 import jalview.xml.binding.embl.EntryType.Feature;
  42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  43 import jalview.xml.binding.jalview.JalviewModel;
  44 import jalview.xml.binding.embl.ROOT;
  45 import jalview.xml.binding.embl.XrefType;
  46
  47 import java.io.File;
  48 import java.io.FileInputStream;
  49 import java.io.InputStream;
  50 import java.text.ParseException;
  51 import java.util.ArrayList;
  52 import java.util.Arrays;
  53 import java.util.Hashtable;
  54 import java.util.List;
  55 import java.util.Map;
  56 import java.util.Map.Entry;
  57 import java.util.regex.Pattern;
  58
  59 import javax.xml.bind.JAXBContext;
  60 import javax.xml.bind.JAXBElement;
  61 import javax.xml.bind.JAXBException;
  62 import javax.xml.stream.FactoryConfigurationError;
  63 import javax.xml.stream.XMLInputFactory;
  64 import javax.xml.stream.XMLStreamException;
  65 import javax.xml.stream.XMLStreamReader;
  66
  67 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  68 {
  69   /*
  70    * JAL-1856 Embl returns this text for query not found
  71    */
  72   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  73
  74   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  75
  76   public EmblXmlSource()
  77   {
  78     super();
  79   }
  80
  81   /**
  82    * Retrieves and parses an emblxml file, and returns an alignment containing
  83    * the parsed sequences, or null if none were found
  84    *
  85    * @param emprefx
  86    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  87    * @param query
  88    * @return
  89    * @throws Exception
  90    */
  91   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  92           throws Exception
  93   {
  94     startQuery();
  95     EBIFetchClient dbFetch = new EBIFetchClient();
  96     File reply;
  97     try
  98     {
  99       reply = dbFetch.fetchDataAsFile(
 100               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
 101               "xml");
 102     } catch (Exception e)
 103     {
 104       stopQuery();
 105       throw new Exception(MessageManager.formatMessage(
 106               "exception.ebiembl_retrieval_failed_on", new String[]
 107               { emprefx.toLowerCase(), query.trim() }), e);
 108     }
 109     return getEmblSequenceRecords(emprefx, query, reply);
 110   }
 111
 112   /**
 113    * parse an emblxml file stored locally
 114    *
 115    * @param emprefx
 116    *          either EMBL or EMBLCDS strings are allowed - anything else will
 117    *          not retrieve emblxml
 118    * @param query
 119    * @param file
 120    *          the EMBL XML file containing the results of a query
 121    * @return
 122    * @throws Exception
 123    */
 124   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 125           File reply) throws Exception
 126   {
 127     List<EntryType> entries = null;
 128     if (reply != null && reply.exists())
 129     {
 130       file = reply.getAbsolutePath();
 131       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 132       {
 133         InputStream is = new FileInputStream(reply);
 134         entries = getEmblEntries(is);
 135       }
 136     }
 137
 138     /*
 139      * invalid accession gets a reply with no <entry> elements, text content of
 140      * EmbFile reads something like (e.g.) this ungrammatical phrase
 141      * Entry: <acc> display type is either not supported or entry is not found.
 142      */
 143     AlignmentI al = null;
 144     List<SequenceI> seqs = new ArrayList<>();
 145     List<SequenceI> peptides = new ArrayList<>();
 146     if (entries != null)
 147     {
 148       for (EntryType entry : entries)
 149       {
 150         SequenceI seq = getSequence(emprefx, entry, peptides);
 151         if (seq != null)
 152         {
 153           seqs.add(seq.deriveSequence());
 154           // place DBReferences on dataset and refer
 155         }
 156       }
 157       if (!seqs.isEmpty())
 158       {
 159         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 160       }
 161       else
 162       {
 163         System.out.println(
 164                 "No record found for '" + emprefx + ":" + query + "'");
 165       }
 166     }
 167
 168     stopQuery();
 169     return al;
 170   }
 171
 172   /**
 173    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 174    * (possibly empty) list of <code>EntryType</code> objects.
 175    *
 176    * is
 177    *
 178    * @return
 179    */
 180   List<EntryType> getEmblEntries(InputStream is)
 181   {
 182     List<EntryType> entries = new ArrayList<>();
 183     try
 184     {
 185       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 186       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 187               .createXMLStreamReader(is);
 188       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 189       JAXBElement<ROOT> rootElement =  um.unmarshal(streamReader, ROOT.class);
 190       ROOT root = rootElement.getValue();
 191
 192       /*
 193        * document root contains either "entry" or "entrySet"
 194        */
 195       if (root == null)
 196       {
 197         return entries;
 198       }
 199       if (root.getEntrySet() != null)
 200       {
 201         entries = root.getEntrySet().getEntry();
 202       }
 203       else if (root.getEntry() != null)
 204       {
 205         entries.add(root.getEntry());
 206       }
 207     } catch (JAXBException | XMLStreamException
 208             | FactoryConfigurationError e)
 209     {
 210       e.printStackTrace();
 211     }
 212     return entries;
 213   }
 214
 215   /**
 216    * A helper method to parse XML data and construct a sequence, with any
 217    * available database references and features
 218    *
 219    * @param emprefx
 220    * @param entry
 221    * @param peptides
 222    * @return
 223    */
 224   SequenceI getSequence(String sourceDb, EntryType entry,
 225           List<SequenceI> peptides)
 226   {
 227     String seqString = entry.getSequence();
 228     if (seqString == null)
 229     {
 230       return null;
 231     }
 232     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 233             "");
 234     String accession = entry.getAccession();
 235     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 236
 237     dna.setDescription(entry.getDescription());
 238     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 239     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 240             accession);
 241     dna.addDBRef(selfRref);
 242     selfRref.setMap(
 243             new Mapping(null, new int[]
 244             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 245                     1));
 246
 247     /*
 248      * add db references
 249      */
 250     List<XrefType> xrefs = entry.getXref();
 251     if (xrefs != null)
 252     {
 253       for (XrefType xref : xrefs)
 254       {
 255         String acc = xref.getId();
 256         String source = DBRefUtils.getCanonicalName(xref.getDb());
 257         String version = xref.getSecondaryId();
 258         if (version == null || "".equals(version))
 259         {
 260           version = "0";
 261         }
 262         dna.addDBRef(new DBRefEntry(source, version, acc));
 263       }
 264     }
 265
 266     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 267     try
 268     {
 269       List<Feature> features = entry.getFeature();
 270       if (features != null)
 271       {
 272         for (Feature feature : features)
 273         {
 274           if (FeatureProperties.isCodingFeature(sourceDb,
 275                   feature.getName()))
 276           {
 277             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 278                     matcher);
 279           }
 280         }
 281       }
 282     } catch (Exception e)
 283     {
 284       System.err.println("EMBL Record Features parsing error!");
 285       System.err
 286               .println("Please report the following to help@jalview.org :");
 287       System.err.println("EMBL Record " + accession);
 288       System.err.println("Resulted in exception: " + e.getMessage());
 289       e.printStackTrace(System.err);
 290     }
 291
 292     return dna;
 293   }
 294
 295   /**
 296    * Extracts coding region and product from a CDS feature and decorates it with
 297    * annotations
 298    *
 299    * @param entry
 300    * @param feature
 301    * @param sourceDb
 302    * @param dna
 303    * @param peptides
 304    * @param matcher
 305    */
 306   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 307           SequenceI dna, List<SequenceI> peptides,
 308           SequenceIdMatcher matcher)
 309   {
 310     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 311     final String accession = entry.getAccession();
 312     final String sequenceVersion = entry.getVersion().toString();
 313
 314     int[] exons = getCdsRanges(entry.getAccession(), feature);
 315
 316     String translation = null;
 317     String proteinName = "";
 318     String proteinId = null;
 319     Map<String, String> vals = new Hashtable<>();
 320
 321     /*
 322      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 323      * (phase is required for CDS features in GFF3 format)
 324      */
 325     int codonStart = 1;
 326
 327     /*
 328      * parse qualifiers, saving protein translation, protein id,
 329      * codon start position, product (name), and 'other values'
 330      */
 331     if (feature.getQualifier() != null)
 332     {
 333       for (Qualifier q : feature.getQualifier())
 334       {
 335         String qname = q.getName();
 336         String value = q.getValue();
 337         value = value == null ? ""
 338                 : value.trim().replace(" ", "").replace("\n", "")
 339                         .replace("\t", "");
 340         if (qname.equals("translation"))
 341         {
 342           translation = value;
 343         }
 344         else if (qname.equals("protein_id"))
 345         {
 346           proteinId = value;
 347         }
 348         else if (qname.equals("codon_start"))
 349         {
 350           try
 351           {
 352             codonStart = Integer.parseInt(value.trim());
 353           } catch (NumberFormatException e)
 354           {
 355             System.err.println("Invalid codon_start in XML for "
 356                     + entry.getAccession() + ": " + e.getMessage());
 357           }
 358         }
 359         else if (qname.equals("product"))
 360         {
 361           // sometimes name is returned e.g. for V00488
 362           proteinName = value;
 363         }
 364         else
 365         {
 366           // throw anything else into the additional properties hash
 367           if (!"".equals(value))
 368           {
 369             vals.put(qname, value);
 370           }
 371         }
 372       }
 373     }
 374
 375     DBRefEntry proteinToEmblProteinRef = null;
 376     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 377
 378     SequenceI product = null;
 379     Mapping dnaToProteinMapping = null;
 380     if (translation != null && proteinName != null && proteinId != null)
 381     {
 382       int translationLength = translation.length();
 383
 384       /*
 385        * look for product in peptides list, if not found, add it
 386        */
 387       product = matcher.findIdMatch(proteinId);
 388       if (product == null)
 389       {
 390         product = new Sequence(proteinId, translation, 1,
 391                 translationLength);
 392         product.setDescription(((proteinName.length() == 0)
 393                 ? "Protein Product from " + sourceDb
 394                 : proteinName));
 395         peptides.add(product);
 396         matcher.add(product);
 397       }
 398
 399       // we have everything - create the mapping and perhaps the protein
 400       // sequence
 401       if (exons == null || exons.length == 0)
 402       {
 403         /*
 404          * workaround until we handle dna location for CDS sequence
 405          * e.g. location="X53828.1:60..1058" correctly
 406          */
 407         System.err.println(
 408                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 409                         + sourceDb + ":" + entry.getAccession() + ")");
 410         int dnaLength = dna.getLength();
 411         if (translationLength * 3 == (1 - codonStart + dnaLength))
 412         {
 413           System.err.println(
 414                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 415           // this might occur for CDS sequences where no features are marked
 416           exons = new int[] { dna.getStart() + (codonStart - 1),
 417               dna.getEnd() };
 418           dnaToProteinMapping = new Mapping(product, exons,
 419                   new int[]
 420                   { 1, translationLength }, 3, 1);
 421         }
 422         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 423         {
 424           System.err.println(
 425                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 426           exons = new int[] { dna.getStart() + (codonStart - 1),
 427               dna.getEnd() - 3 };
 428           dnaToProteinMapping = new Mapping(product, exons,
 429                   new int[]
 430                   { 1, translationLength }, 3, 1);
 431         }
 432       }
 433       else
 434       {
 435         // Trim the exon mapping if necessary - the given product may only be a
 436         // fragment of a larger protein. (EMBL:AY043181 is an example)
 437
 438         if (isEmblCdna)
 439         {
 440           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 441           // map
 442           // if given a dataset reference, search dataset for parent EMBL
 443           // sequence if it exists and set its map
 444           // make a new feature annotating the coding contig
 445         }
 446         else
 447         {
 448           // final product length truncation check
 449           int[] cdsRanges = adjustForProteinLength(translationLength,
 450                   exons);
 451           dnaToProteinMapping = new Mapping(product, cdsRanges,
 452                   new int[]
 453                   { 1, translationLength }, 3, 1);
 454           if (product != null)
 455           {
 456             /*
 457              * make xref with mapping from protein to EMBL dna
 458              */
 459             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 460                     sequenceVersion, proteinId,
 461                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 462             product.addDBRef(proteinToEmblRef);
 463
 464             /*
 465              * make xref from protein to EMBLCDS; we assume here that the
 466              * CDS sequence version is same as dna sequence (?!)
 467              */
 468             MapList proteinToCdsMapList = new MapList(
 469                     new int[]
 470                     { 1, translationLength },
 471                     new int[]
 472                     { 1 + (codonStart - 1),
 473                         (codonStart - 1) + 3 * translationLength },
 474                     1, 3);
 475             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 476                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 477                     new Mapping(proteinToCdsMapList));
 478             product.addDBRef(proteinToEmblCdsRef);
 479
 480             /*
 481              * make 'direct' xref from protein to EMBLCDSPROTEIN
 482              */
 483             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 484             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 485             proteinToEmblProteinRef.setMap(null);
 486             product.addDBRef(proteinToEmblProteinRef);
 487           }
 488         }
 489       }
 490
 491       /*
 492        * add cds features to dna sequence
 493        */
 494       String cds = feature.getName(); // "CDS"
 495       for (int xint = 0; exons != null
 496               && xint < exons.length - 1; xint += 2)
 497       {
 498         int exonStart = exons[xint];
 499         int exonEnd = exons[xint + 1];
 500         int begin = Math.min(exonStart, exonEnd);
 501         int end = Math.max(exonStart, exonEnd);
 502         int exonNumber = xint / 2 + 1;
 503         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 504                 exonNumber, proteinName, proteinId);
 505
 506         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 507                 vals);
 508
 509         sf.setEnaLocation(feature.getLocation());
 510         boolean forwardStrand = exonStart <= exonEnd;
 511         sf.setStrand(forwardStrand ? "+" : "-");
 512         sf.setPhase(String.valueOf(codonStart - 1));
 513         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 514         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 515
 516         dna.addSequenceFeature(sf);
 517       }
 518     }
 519
 520     /*
 521      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 522      */
 523     boolean hasUniprotDbref = false;
 524     List<XrefType> xrefs = feature.getXref();
 525     if (xrefs != null)
 526     {
 527       boolean mappingUsed = false;
 528       for (XrefType xref : xrefs)
 529       {
 530         /*
 531          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 532          */
 533         String source = DBRefUtils.getCanonicalName(xref.getDb());
 534         String version = xref.getSecondaryId();
 535         if (version == null || "".equals(version))
 536         {
 537           version = "0";
 538         }
 539         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 540         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 541                 dbref.getAccessionId());
 542         if (source.equals(DBRefSource.UNIPROT))
 543         {
 544           String proteinSeqName = DBRefSource.UNIPROT + "|"
 545                   + dbref.getAccessionId();
 546           if (dnaToProteinMapping != null
 547                   && dnaToProteinMapping.getTo() != null)
 548           {
 549             if (mappingUsed)
 550             {
 551               /*
 552                * two or more Uniprot xrefs for the same CDS -
 553                * each needs a distinct Mapping (as to a different sequence)
 554                */
 555               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 556             }
 557             mappingUsed = true;
 558
 559             /*
 560              * try to locate the protein mapped to (possibly by a
 561              * previous CDS feature); if not found, construct it from
 562              * the EMBL translation
 563              */
 564             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 565             if (proteinSeq == null)
 566             {
 567               proteinSeq = new Sequence(proteinSeqName,
 568                       product.getSequenceAsString());
 569               matcher.add(proteinSeq);
 570               peptides.add(proteinSeq);
 571             }
 572             dnaToProteinMapping.setTo(proteinSeq);
 573             dnaToProteinMapping.setMappedFromId(proteinId);
 574             proteinSeq.addDBRef(proteinDbRef);
 575             dbref.setMap(dnaToProteinMapping);
 576           }
 577           hasUniprotDbref = true;
 578         }
 579         if (product != null)
 580         {
 581           /*
 582            * copy feature dbref to our protein product
 583            */
 584           DBRefEntry pref = proteinDbRef;
 585           pref.setMap(null); // reference is direct
 586           product.addDBRef(pref);
 587           // Add converse mapping reference
 588           if (dnaToProteinMapping != null)
 589           {
 590             Mapping pmap = new Mapping(dna,
 591                     dnaToProteinMapping.getMap().getInverse());
 592             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 593             pref.setMap(pmap);
 594             if (dnaToProteinMapping.getTo() != null)
 595             {
 596               dnaToProteinMapping.getTo().addDBRef(pref);
 597             }
 598           }
 599         }
 600         dna.addDBRef(dbref);
 601       }
 602     }
 603
 604     /*
 605      * if we have a product (translation) but no explicit Uniprot dbref
 606      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 607      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 608      */
 609     if (!hasUniprotDbref && product != null)
 610     {
 611       if (proteinToEmblProteinRef == null)
 612       {
 613         // assuming CDSPROTEIN sequence version = dna version (?!)
 614         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 615                 sequenceVersion, proteinId);
 616       }
 617       product.addDBRef(proteinToEmblProteinRef);
 618
 619       if (dnaToProteinMapping != null
 620               && dnaToProteinMapping.getTo() != null)
 621       {
 622         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 623                 DBRefSource.EMBLCDSProduct, sequenceVersion,
 624                 proteinId);
 625         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 626         dnaToProteinMapping.setMappedFromId(proteinId);
 627         dna.addDBRef(dnaToEmblProteinRef);
 628       }
 629     }
 630   }
 631
 632   @Override
 633   public boolean isDnaCoding()
 634   {
 635     return true;
 636   }
 637
 638   /**
 639    * Returns the CDS positions as a single array of [start, end, start, end...]
 640    * positions. If on the reverse strand, these will be in descending order.
 641    *
 642    * @param accession
 643    * @param feature
 644    * @return
 645    */
 646   protected int[] getCdsRanges(String accession, Feature feature)
 647   {
 648     String location = feature.getLocation();
 649     if (location == null)
 650     {
 651       return new int[] {};
 652     }
 653
 654     try
 655     {
 656       List<int[]> ranges = DnaUtils.parseLocation(location);
 657       return listToArray(ranges);
 658     } catch (ParseException e)
 659     {
 660       Cache.log.warn(
 661               String.format("Not parsing inexact CDS location %s in ENA %s",
 662                       location, accession));
 663       return new int[] {};
 664     }
 665   }
 666
 667   /**
 668    * Converts a list of [start, end] ranges to a single array of [start, end,
 669    * start, end ...]
 670    *
 671    * @param ranges
 672    * @return
 673    */
 674   int[] listToArray(List<int[]> ranges)
 675   {
 676     int[] result = new int[ranges.size() * 2];
 677     int i = 0;
 678     for (int[] range : ranges)
 679     {
 680       result[i++] = range[0];
 681       result[i++] = range[1];
 682     }
 683     return result;
 684   }
 685
 686   /**
 687    * Helper method to construct a SequenceFeature for one cds range
 688    *
 689    * @param type
 690    *          feature type ("CDS")
 691    * @param desc
 692    *          description
 693    * @param begin
 694    *          start position
 695    * @param end
 696    *          end position
 697    * @param group
 698    *          feature group
 699    * @param vals
 700    *          map of 'miscellaneous values' for feature
 701    * @return
 702    */
 703   protected SequenceFeature makeCdsFeature(String type, String desc,
 704           int begin, int end, String group, Map<String, String> vals)
 705   {
 706     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 707     if (!vals.isEmpty())
 708     {
 709       StringBuilder sb = new StringBuilder();
 710       boolean first = true;
 711       for (Entry<String, String> val : vals.entrySet())
 712       {
 713         if (!first)
 714         {
 715           sb.append(";");
 716         }
 717         sb.append(val.getKey()).append("=").append(val.getValue());
 718         first = false;
 719         sf.setValue(val.getKey(), val.getValue());
 720       }
 721       sf.setAttributes(sb.toString());
 722     }
 723     return sf;
 724   }
 725
 726   /**
 727    * Truncates (if necessary) the exon intervals to match 3 times the length of
 728    * the protein; also accepts 3 bases longer (for stop codon not included in
 729    * protein)
 730    *
 731    * @param proteinLength
 732    * @param exon
 733    *          an array of [start, end, start, end...] intervals
 734    * @return the same array (if unchanged) or a truncated copy
 735    */
 736   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 737   {
 738     if (proteinLength <= 0 || exon == null)
 739     {
 740       return exon;
 741     }
 742     int expectedCdsLength = proteinLength * 3;
 743     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 744
 745     /*
 746      * if exon length matches protein, or is shorter, or longer by the
 747      * length of a stop codon (3 bases), then leave it unchanged
 748      */
 749     if (expectedCdsLength >= exonLength
 750             || expectedCdsLength == exonLength - 3)
 751     {
 752       return exon;
 753     }
 754
 755     int origxon[];
 756     int sxpos = -1;
 757     int endxon = 0;
 758     origxon = new int[exon.length];
 759     System.arraycopy(exon, 0, origxon, 0, exon.length);
 760     int cdspos = 0;
 761     for (int x = 0; x < exon.length; x += 2)
 762     {
 763       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 764       if (expectedCdsLength <= cdspos)
 765       {
 766         // advanced beyond last codon.
 767         sxpos = x;
 768         if (expectedCdsLength != cdspos)
 769         {
 770           // System.err
 771           // .println("Truncating final exon interval on region by "
 772           // + (cdspos - cdslength));
 773         }
 774
 775         /*
 776          * shrink the final exon - reduce end position if forward
 777          * strand, increase it if reverse
 778          */
 779         if (exon[x + 1] >= exon[x])
 780         {
 781           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 782         }
 783         else
 784         {
 785           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 786         }
 787         break;
 788       }
 789     }
 790
 791     if (sxpos != -1)
 792     {
 793       // and trim the exon interval set if necessary
 794       int[] nxon = new int[sxpos + 2];
 795       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 796       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 797                                 // set
 798       exon = nxon;
 799     }
 800     return exon;
 801   }
 802
 803 }