src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import java.io.File;
  24 import java.io.FileInputStream;
  25 import java.io.InputStream;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.Hashtable;
  30 import java.util.List;
  31 import java.util.Map;
  32 import java.util.Map.Entry;
  33
  34 import javax.xml.bind.JAXBContext;
  35 import javax.xml.bind.JAXBElement;
  36 import javax.xml.bind.JAXBException;
  37 import javax.xml.stream.FactoryConfigurationError;
  38 import javax.xml.stream.XMLInputFactory;
  39 import javax.xml.stream.XMLStreamException;
  40 import javax.xml.stream.XMLStreamReader;
  41
  42 import jalview.analysis.SequenceIdMatcher;
  43 import jalview.bin.Cache;
  44 import jalview.datamodel.Alignment;
  45 import jalview.datamodel.AlignmentI;
  46 import jalview.datamodel.DBRefEntry;
  47 import jalview.datamodel.DBRefSource;
  48 import jalview.datamodel.FeatureProperties;
  49 import jalview.datamodel.Mapping;
  50 import jalview.datamodel.Sequence;
  51 import jalview.datamodel.SequenceFeature;
  52 import jalview.datamodel.SequenceI;
  53 import jalview.util.DBRefUtils;
  54 import jalview.util.DnaUtils;
  55 import jalview.util.MapList;
  56 import jalview.util.MappingUtils;
  57 import jalview.util.MessageManager;
  58 import jalview.ws.ebi.EBIFetchClient;
  59 import jalview.xml.binding.embl.EntryType;
  60 import jalview.xml.binding.embl.EntryType.Feature;
  61 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  62 import jalview.xml.binding.embl.ROOT;
  63 import jalview.xml.binding.embl.XrefType;
  64
  65 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  66 {
  67   /*
  68    * JAL-1856 Embl returns this text for query not found
  69    */
  70   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  71
  72   public EmblXmlSource()
  73   {
  74     super();
  75   }
  76
  77   /**
  78    * Retrieves and parses an emblxml file, and returns an alignment containing
  79    * the parsed sequences, or null if none were found
  80    *
  81    * @param emprefx
  82    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  83    * @param query
  84    * @return
  85    * @throws Exception
  86    */
  87   protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
  88           throws Exception
  89   {
  90     startQuery();
  91     EBIFetchClient dbFetch = new EBIFetchClient();
  92     File reply;
  93     try
  94     {
  95       reply = dbFetch.fetchDataAsFile(
  96               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
  97               "xml");
  98     } catch (Exception e)
  99     {
 100       stopQuery();
 101       throw new Exception(MessageManager.formatMessage(
 102               "exception.ebiembl_retrieval_failed_on", new String[]
 103               { emprefx.toLowerCase(), query.trim() }), e);
 104     }
 105     return getEmblSequenceRecords(emprefx, query, reply);
 106   }
 107
 108   /**
 109    * parse an emblxml file stored locally
 110    *
 111    * @param emprefx
 112    *          either EMBL or EMBLCDS strings are allowed - anything else will
 113    *          not retrieve emblxml
 114    * @param query
 115    * @param file
 116    *          the EMBL XML file containing the results of a query
 117    * @return
 118    * @throws Exception
 119    */
 120   protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
 121           File reply) throws Exception
 122   {
 123     List<EntryType> entries = null;
 124     if (reply != null && reply.exists())
 125     {
 126       file = reply.getAbsolutePath();
 127       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 128       {
 129         InputStream is = new FileInputStream(reply);
 130         entries = getEmblEntries(is);
 131       }
 132     }
 133
 134     /*
 135      * invalid accession gets a reply with no <entry> elements, text content of
 136      * EmbFile reads something like (e.g.) this ungrammatical phrase
 137      * Entry: <acc> display type is either not supported or entry is not found.
 138      */
 139     AlignmentI al = null;
 140     List<SequenceI> seqs = new ArrayList<>();
 141     List<SequenceI> peptides = new ArrayList<>();
 142     if (entries != null)
 143     {
 144       for (EntryType entry : entries)
 145       {
 146         SequenceI seq = getSequence(emprefx, entry, peptides);
 147         if (seq != null)
 148         {
 149           seqs.add(seq.deriveSequence());
 150           // place DBReferences on dataset and refer
 151         }
 152       }
 153       if (!seqs.isEmpty())
 154       {
 155         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 156       }
 157       else
 158       {
 159         System.out.println(
 160                 "No record found for '" + emprefx + ":" + query + "'");
 161       }
 162     }
 163
 164     stopQuery();
 165     return al;
 166   }
 167
 168   /**
 169    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 170    * (possibly empty) list of <code>EntryType</code> objects.
 171    *
 172    * is
 173    *
 174    * @return
 175    */
 176   List<EntryType> getEmblEntries(InputStream is)
 177   {
 178     List<EntryType> entries = new ArrayList<>();
 179     try
 180     {
 181       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 182       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 183               .createXMLStreamReader(is);
 184       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 185       JAXBElement<ROOT> rootElement =  um.unmarshal(streamReader, ROOT.class);
 186       ROOT root = rootElement.getValue();
 187
 188       /*
 189        * document root contains either "entry" or "entrySet"
 190        */
 191       if (root == null)
 192       {
 193         return entries;
 194       }
 195       if (root.getEntrySet() != null)
 196       {
 197         entries = root.getEntrySet().getEntry();
 198       }
 199       else if (root.getEntry() != null)
 200       {
 201         entries.add(root.getEntry());
 202       }
 203     } catch (JAXBException | XMLStreamException
 204             | FactoryConfigurationError e)
 205     {
 206       e.printStackTrace();
 207     }
 208     return entries;
 209   }
 210
 211   /**
 212    * A helper method to parse XML data and construct a sequence, with any
 213    * available database references and features
 214    *
 215    * @param emprefx
 216    * @param entry
 217    * @param peptides
 218    * @return
 219    */
 220   SequenceI getSequence(String sourceDb, EntryType entry,
 221           List<SequenceI> peptides)
 222   {
 223     String seqString = entry.getSequence();
 224     if (seqString == null)
 225     {
 226       return null;
 227     }
 228     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 229             "");
 230     String accession = entry.getAccession();
 231     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 232
 233     dna.setDescription(entry.getDescription());
 234     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 235     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 236             accession);
 237     dna.addDBRef(selfRref);
 238     selfRref.setMap(
 239             new Mapping(null, new int[]
 240             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 241                     1));
 242
 243     /*
 244      * add db references
 245      */
 246     List<XrefType> xrefs = entry.getXref();
 247     if (xrefs != null)
 248     {
 249       for (XrefType xref : xrefs)
 250       {
 251         String acc = xref.getId();
 252         String source = DBRefUtils.getCanonicalName(xref.getDb());
 253         String version = xref.getSecondaryId();
 254         if (version == null || "".equals(version))
 255         {
 256           version = "0";
 257         }
 258         dna.addDBRef(new DBRefEntry(source, version, acc));
 259       }
 260     }
 261
 262     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 263     try
 264     {
 265       List<Feature> features = entry.getFeature();
 266       if (features != null)
 267       {
 268         for (Feature feature : features)
 269         {
 270           if (FeatureProperties.isCodingFeature(sourceDb,
 271                   feature.getName()))
 272           {
 273             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 274                     matcher);
 275           }
 276         }
 277       }
 278     } catch (Exception e)
 279     {
 280       System.err.println("EMBL Record Features parsing error!");
 281       System.err
 282               .println("Please report the following to help@jalview.org :");
 283       System.err.println("EMBL Record " + accession);
 284       System.err.println("Resulted in exception: " + e.getMessage());
 285       e.printStackTrace(System.err);
 286     }
 287
 288     return dna;
 289   }
 290
 291   /**
 292    * Extracts coding region and product from a CDS feature and decorates it with
 293    * annotations
 294    *
 295    * @param entry
 296    * @param feature
 297    * @param sourceDb
 298    * @param dna
 299    * @param peptides
 300    * @param matcher
 301    */
 302   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 303           SequenceI dna, List<SequenceI> peptides,
 304           SequenceIdMatcher matcher)
 305   {
 306     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 307     final String accession = entry.getAccession();
 308     final String sequenceVersion = entry.getVersion().toString();
 309
 310     int[] exons = getCdsRanges(entry.getAccession(), feature);
 311
 312     String translation = null;
 313     String proteinName = "";
 314     String proteinId = null;
 315     Map<String, String> vals = new Hashtable<>();
 316
 317     /*
 318      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 319      * (phase is required for CDS features in GFF3 format)
 320      */
 321     int codonStart = 1;
 322
 323     /*
 324      * parse qualifiers, saving protein translation, protein id,
 325      * codon start position, product (name), and 'other values'
 326      */
 327     if (feature.getQualifier() != null)
 328     {
 329       for (Qualifier q : feature.getQualifier())
 330       {
 331         String qname = q.getName();
 332         String value = q.getValue();
 333         value = value == null ? ""
 334                 : value.trim().replace(" ", "").replace("\n", "")
 335                         .replace("\t", "");
 336         if (qname.equals("translation"))
 337         {
 338           translation = value;
 339         }
 340         else if (qname.equals("protein_id"))
 341         {
 342           proteinId = value;
 343         }
 344         else if (qname.equals("codon_start"))
 345         {
 346           try
 347           {
 348             codonStart = Integer.parseInt(value.trim());
 349           } catch (NumberFormatException e)
 350           {
 351             System.err.println("Invalid codon_start in XML for "
 352                     + entry.getAccession() + ": " + e.getMessage());
 353           }
 354         }
 355         else if (qname.equals("product"))
 356         {
 357           // sometimes name is returned e.g. for V00488
 358           proteinName = value;
 359         }
 360         else
 361         {
 362           // throw anything else into the additional properties hash
 363           if (!"".equals(value))
 364           {
 365             vals.put(qname, value);
 366           }
 367         }
 368       }
 369     }
 370
 371     DBRefEntry proteinToEmblProteinRef = null;
 372     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 373
 374     SequenceI product = null;
 375     Mapping dnaToProteinMapping = null;
 376     if (translation != null && proteinName != null && proteinId != null)
 377     {
 378       int translationLength = translation.length();
 379
 380       /*
 381        * look for product in peptides list, if not found, add it
 382        */
 383       product = matcher.findIdMatch(proteinId);
 384       if (product == null)
 385       {
 386         product = new Sequence(proteinId, translation, 1,
 387                 translationLength);
 388         product.setDescription(((proteinName.length() == 0)
 389                 ? "Protein Product from " + sourceDb
 390                 : proteinName));
 391         peptides.add(product);
 392         matcher.add(product);
 393       }
 394
 395       // we have everything - create the mapping and perhaps the protein
 396       // sequence
 397       if (exons == null || exons.length == 0)
 398       {
 399         /*
 400          * workaround until we handle dna location for CDS sequence
 401          * e.g. location="X53828.1:60..1058" correctly
 402          */
 403         System.err.println(
 404                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 405                         + sourceDb + ":" + entry.getAccession() + ")");
 406         int dnaLength = dna.getLength();
 407         if (translationLength * 3 == (1 - codonStart + dnaLength))
 408         {
 409           System.err.println(
 410                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 411           // this might occur for CDS sequences where no features are marked
 412           exons = new int[] { dna.getStart() + (codonStart - 1),
 413               dna.getEnd() };
 414           dnaToProteinMapping = new Mapping(product, exons,
 415                   new int[]
 416                   { 1, translationLength }, 3, 1);
 417         }
 418         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 419         {
 420           System.err.println(
 421                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 422           exons = new int[] { dna.getStart() + (codonStart - 1),
 423               dna.getEnd() - 3 };
 424           dnaToProteinMapping = new Mapping(product, exons,
 425                   new int[]
 426                   { 1, translationLength }, 3, 1);
 427         }
 428       }
 429       else
 430       {
 431         // Trim the exon mapping if necessary - the given product may only be a
 432         // fragment of a larger protein. (EMBL:AY043181 is an example)
 433
 434         if (isEmblCdna)
 435         {
 436           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 437           // map
 438           // if given a dataset reference, search dataset for parent EMBL
 439           // sequence if it exists and set its map
 440           // make a new feature annotating the coding contig
 441         }
 442         else
 443         {
 444           // final product length truncation check
 445           int[] cdsRanges = adjustForProteinLength(translationLength,
 446                   exons);
 447           dnaToProteinMapping = new Mapping(product, cdsRanges,
 448                   new int[]
 449                   { 1, translationLength }, 3, 1);
 450           if (product != null)
 451           {
 452             /*
 453              * make xref with mapping from protein to EMBL dna
 454              */
 455             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 456                     sequenceVersion, proteinId,
 457                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 458             product.addDBRef(proteinToEmblRef);
 459
 460             /*
 461              * make xref from protein to EMBLCDS; we assume here that the
 462              * CDS sequence version is same as dna sequence (?!)
 463              */
 464             MapList proteinToCdsMapList = new MapList(
 465                     new int[]
 466                     { 1, translationLength },
 467                     new int[]
 468                     { 1 + (codonStart - 1),
 469                         (codonStart - 1) + 3 * translationLength },
 470                     1, 3);
 471             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 472                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 473                     new Mapping(proteinToCdsMapList));
 474             product.addDBRef(proteinToEmblCdsRef);
 475
 476             /*
 477              * make 'direct' xref from protein to EMBLCDSPROTEIN
 478              */
 479             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 480             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 481             proteinToEmblProteinRef.setMap(null);
 482             product.addDBRef(proteinToEmblProteinRef);
 483           }
 484         }
 485       }
 486
 487       /*
 488        * add cds features to dna sequence
 489        */
 490       String cds = feature.getName(); // "CDS"
 491       for (int xint = 0; exons != null
 492               && xint < exons.length - 1; xint += 2)
 493       {
 494         int exonStart = exons[xint];
 495         int exonEnd = exons[xint + 1];
 496         int begin = Math.min(exonStart, exonEnd);
 497         int end = Math.max(exonStart, exonEnd);
 498         int exonNumber = xint / 2 + 1;
 499         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 500                 exonNumber, proteinName, proteinId);
 501
 502         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 503                 vals);
 504
 505         sf.setEnaLocation(feature.getLocation());
 506         boolean forwardStrand = exonStart <= exonEnd;
 507         sf.setStrand(forwardStrand ? "+" : "-");
 508         sf.setPhase(String.valueOf(codonStart - 1));
 509         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 510         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 511
 512         dna.addSequenceFeature(sf);
 513       }
 514     }
 515
 516     /*
 517      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 518      */
 519     boolean hasUniprotDbref = false;
 520     List<XrefType> xrefs = feature.getXref();
 521     if (xrefs != null)
 522     {
 523       boolean mappingUsed = false;
 524       for (XrefType xref : xrefs)
 525       {
 526         /*
 527          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 528          */
 529         String source = DBRefUtils.getCanonicalName(xref.getDb());
 530         String version = xref.getSecondaryId();
 531         if (version == null || "".equals(version))
 532         {
 533           version = "0";
 534         }
 535         DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
 536         DBRefEntry proteinDbRef = new DBRefEntry(source, version,
 537                 dbref.getAccessionId());
 538         if (source.equals(DBRefSource.UNIPROT))
 539         {
 540           String proteinSeqName = DBRefSource.UNIPROT + "|"
 541                   + dbref.getAccessionId();
 542           if (dnaToProteinMapping != null
 543                   && dnaToProteinMapping.getTo() != null)
 544           {
 545             if (mappingUsed)
 546             {
 547               /*
 548                * two or more Uniprot xrefs for the same CDS -
 549                * each needs a distinct Mapping (as to a different sequence)
 550                */
 551               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 552             }
 553             mappingUsed = true;
 554
 555             /*
 556              * try to locate the protein mapped to (possibly by a
 557              * previous CDS feature); if not found, construct it from
 558              * the EMBL translation
 559              */
 560             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 561             if (proteinSeq == null)
 562             {
 563               proteinSeq = new Sequence(proteinSeqName,
 564                       product.getSequenceAsString());
 565               matcher.add(proteinSeq);
 566               peptides.add(proteinSeq);
 567             }
 568             dnaToProteinMapping.setTo(proteinSeq);
 569             dnaToProteinMapping.setMappedFromId(proteinId);
 570             proteinSeq.addDBRef(proteinDbRef);
 571             dbref.setMap(dnaToProteinMapping);
 572           }
 573           hasUniprotDbref = true;
 574         }
 575         if (product != null)
 576         {
 577           /*
 578            * copy feature dbref to our protein product
 579            */
 580           DBRefEntry pref = proteinDbRef;
 581           pref.setMap(null); // reference is direct
 582           product.addDBRef(pref);
 583           // Add converse mapping reference
 584           if (dnaToProteinMapping != null)
 585           {
 586             Mapping pmap = new Mapping(dna,
 587                     dnaToProteinMapping.getMap().getInverse());
 588             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 589             pref.setMap(pmap);
 590             if (dnaToProteinMapping.getTo() != null)
 591             {
 592               dnaToProteinMapping.getTo().addDBRef(pref);
 593             }
 594           }
 595         }
 596         dna.addDBRef(dbref);
 597       }
 598     }
 599
 600     /*
 601      * if we have a product (translation) but no explicit Uniprot dbref
 602      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 603      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 604      */
 605     if (!hasUniprotDbref && product != null)
 606     {
 607       if (proteinToEmblProteinRef == null)
 608       {
 609         // assuming CDSPROTEIN sequence version = dna version (?!)
 610         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 611                 sequenceVersion, proteinId);
 612       }
 613       product.addDBRef(proteinToEmblProteinRef);
 614
 615       if (dnaToProteinMapping != null
 616               && dnaToProteinMapping.getTo() != null)
 617       {
 618         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 619                 DBRefSource.EMBLCDSProduct, sequenceVersion,
 620                 proteinId);
 621         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 622         dnaToProteinMapping.setMappedFromId(proteinId);
 623         dna.addDBRef(dnaToEmblProteinRef);
 624       }
 625     }
 626   }
 627
 628   @Override
 629   public boolean isDnaCoding()
 630   {
 631     return true;
 632   }
 633
 634   /**
 635    * Returns the CDS positions as a single array of [start, end, start, end...]
 636    * positions. If on the reverse strand, these will be in descending order.
 637    *
 638    * @param accession
 639    * @param feature
 640    * @return
 641    */
 642   protected int[] getCdsRanges(String accession, Feature feature)
 643   {
 644     String location = feature.getLocation();
 645     if (location == null)
 646     {
 647       return new int[] {};
 648     }
 649
 650     try
 651     {
 652       List<int[]> ranges = DnaUtils.parseLocation(location);
 653       return listToArray(ranges);
 654     } catch (ParseException e)
 655     {
 656       Cache.log.warn(
 657               String.format("Not parsing inexact CDS location %s in ENA %s",
 658                       location, accession));
 659       return new int[] {};
 660     }
 661   }
 662
 663   /**
 664    * Converts a list of [start, end] ranges to a single array of [start, end,
 665    * start, end ...]
 666    *
 667    * @param ranges
 668    * @return
 669    */
 670   int[] listToArray(List<int[]> ranges)
 671   {
 672     int[] result = new int[ranges.size() * 2];
 673     int i = 0;
 674     for (int[] range : ranges)
 675     {
 676       result[i++] = range[0];
 677       result[i++] = range[1];
 678     }
 679     return result;
 680   }
 681
 682   /**
 683    * Helper method to construct a SequenceFeature for one cds range
 684    *
 685    * @param type
 686    *          feature type ("CDS")
 687    * @param desc
 688    *          description
 689    * @param begin
 690    *          start position
 691    * @param end
 692    *          end position
 693    * @param group
 694    *          feature group
 695    * @param vals
 696    *          map of 'miscellaneous values' for feature
 697    * @return
 698    */
 699   protected SequenceFeature makeCdsFeature(String type, String desc,
 700           int begin, int end, String group, Map<String, String> vals)
 701   {
 702     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 703     if (!vals.isEmpty())
 704     {
 705       for (Entry<String, String> val : vals.entrySet())
 706       {
 707         sf.setValue(val.getKey(), val.getValue());
 708       }
 709     }
 710     return sf;
 711   }
 712
 713   /**
 714    * Truncates (if necessary) the exon intervals to match 3 times the length of
 715    * the protein; also accepts 3 bases longer (for stop codon not included in
 716    * protein)
 717    *
 718    * @param proteinLength
 719    * @param exon
 720    *          an array of [start, end, start, end...] intervals
 721    * @return the same array (if unchanged) or a truncated copy
 722    */
 723   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 724   {
 725     if (proteinLength <= 0 || exon == null)
 726     {
 727       return exon;
 728     }
 729     int expectedCdsLength = proteinLength * 3;
 730     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 731
 732     /*
 733      * if exon length matches protein, or is shorter, or longer by the
 734      * length of a stop codon (3 bases), then leave it unchanged
 735      */
 736     if (expectedCdsLength >= exonLength
 737             || expectedCdsLength == exonLength - 3)
 738     {
 739       return exon;
 740     }
 741
 742     int origxon[];
 743     int sxpos = -1;
 744     int endxon = 0;
 745     origxon = new int[exon.length];
 746     System.arraycopy(exon, 0, origxon, 0, exon.length);
 747     int cdspos = 0;
 748     for (int x = 0; x < exon.length; x += 2)
 749     {
 750       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 751       if (expectedCdsLength <= cdspos)
 752       {
 753         // advanced beyond last codon.
 754         sxpos = x;
 755         if (expectedCdsLength != cdspos)
 756         {
 757           // System.err
 758           // .println("Truncating final exon interval on region by "
 759           // + (cdspos - cdslength));
 760         }
 761
 762         /*
 763          * shrink the final exon - reduce end position if forward
 764          * strand, increase it if reverse
 765          */
 766         if (exon[x + 1] >= exon[x])
 767         {
 768           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 769         }
 770         else
 771         {
 772           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 773         }
 774         break;
 775       }
 776     }
 777
 778     if (sxpos != -1)
 779     {
 780       // and trim the exon interval set if necessary
 781       int[] nxon = new int[sxpos + 2];
 782       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 783       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 784                                 // set
 785       exon = nxon;
 786     }
 787     return exon;
 788   }
 789
 790 }