src/jalview/ws/dbsources/EmblXmlSource.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.ws.dbsources;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.bin.Cache;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.DBRefEntry;
  28 import jalview.datamodel.DBRefSource;
  29 import jalview.datamodel.FeatureProperties;
  30 import jalview.datamodel.Mapping;
  31 import jalview.datamodel.Sequence;
  32 import jalview.datamodel.SequenceFeature;
  33 import jalview.datamodel.SequenceI;
  34 import jalview.util.DBRefUtils;
  35 import jalview.util.DnaUtils;
  36 import jalview.util.MapList;
  37 import jalview.util.MappingUtils;
  38 import jalview.util.MessageManager;
  39 import jalview.ws.ebi.EBIFetchClient;
  40 import jalview.xml.binding.embl.EntryType;
  41 import jalview.xml.binding.embl.EntryType.Feature;
  42 import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
  43 import jalview.xml.binding.embl.XrefType;
  44
  45 import java.io.File;
  46 import java.io.FileInputStream;
  47 import java.io.InputStream;
  48 import java.text.ParseException;
  49 import java.util.ArrayList;
  50 import java.util.Arrays;
  51 import java.util.Hashtable;
  52 import java.util.List;
  53 import java.util.Map;
  54 import java.util.Map.Entry;
  55 import java.util.regex.Pattern;
  56
  57 import javax.xml.bind.JAXBContext;
  58 import javax.xml.bind.JAXBException;
  59 import javax.xml.stream.FactoryConfigurationError;
  60 import javax.xml.stream.XMLInputFactory;
  61 import javax.xml.stream.XMLStreamException;
  62 import javax.xml.stream.XMLStreamReader;
  63
  64 public abstract class EmblXmlSource extends EbiFileRetrievedProxy
  65 {
  66   /*
  67    * JAL-1856 Embl returns this text for query not found
  68    */
  69   private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
  70
  71   private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
  72
  73   public EmblXmlSource()
  74   {
  75     super();
  76   }
  77
  78   /**
  79    * Retrieves and parses an emblxml file, and returns an alignment containing
  80    * the parsed sequences, or null if none were found
  81    *
  82    * @param emprefx
  83    *          "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
  84    * @param query
  85    * @return
  86    * @throws Exception
  87    */
  88   public AlignmentI getEmblSequenceRecords(String emprefx, String query)
  89           throws Exception
  90   {
  91     startQuery();
  92     EBIFetchClient dbFetch = new EBIFetchClient();
  93     File reply;
  94     try
  95     {
  96       reply = dbFetch.fetchDataAsFile(
  97               emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
  98               "xml");
  99     } catch (Exception e)
 100     {
 101       stopQuery();
 102       throw new Exception(MessageManager.formatMessage(
 103               "exception.ebiembl_retrieval_failed_on", new String[]
 104               { emprefx.toLowerCase(), query.trim() }), e);
 105     }
 106     return getEmblSequenceRecords(emprefx, query, reply);
 107   }
 108
 109   /**
 110    * parse an emblxml file stored locally
 111    *
 112    * @param emprefx
 113    *          either EMBL or EMBLCDS strings are allowed - anything else will
 114    *          not retrieve emblxml
 115    * @param query
 116    * @param file
 117    *          the EMBL XML file containing the results of a query
 118    * @return
 119    * @throws Exception
 120    */
 121   public AlignmentI getEmblSequenceRecords(String emprefx, String query,
 122           File reply) throws Exception
 123   {
 124     List<EntryType> entries = null;
 125     if (reply != null && reply.exists())
 126     {
 127       file = reply.getAbsolutePath();
 128       if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
 129       {
 130         InputStream is = new FileInputStream(reply);
 131         entries = getEmblEntries(is);
 132       }
 133     }
 134
 135     /*
 136      * invalid accession gets a reply with no <entry> elements, text content of
 137      * EmbFile reads something like (e.g.) this ungrammatical phrase
 138      * Entry: <acc> display type is either not supported or entry is not found.
 139      */
 140     AlignmentI al = null;
 141     List<SequenceI> seqs = new ArrayList<>();
 142     List<SequenceI> peptides = new ArrayList<>();
 143     if (entries != null)
 144     {
 145       for (EntryType entry : entries)
 146       {
 147         SequenceI seq = getSequence(emprefx, entry, peptides);
 148         if (seq != null)
 149         {
 150           seqs.add(seq.deriveSequence());
 151           // place DBReferences on dataset and refer
 152         }
 153       }
 154       if (!seqs.isEmpty())
 155       {
 156         al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
 157       }
 158       else
 159       {
 160         System.out.println(
 161                 "No record found for '" + emprefx + ":" + query + "'");
 162       }
 163     }
 164
 165     stopQuery();
 166     return al;
 167   }
 168
 169   /**
 170    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
 171    * (possibly empty) list of <code>EntryType</code> objects.
 172    *
 173    * is
 174    *
 175    * @return
 176    */
 177   List<EntryType> getEmblEntries(InputStream is)
 178   {
 179     List<EntryType> entries = new ArrayList<>();
 180     try
 181     {
 182       JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
 183       XMLStreamReader streamReader = XMLInputFactory.newInstance()
 184               .createXMLStreamReader(is);
 185       javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
 186       jalview.xml.binding.embl.ROOT root = (jalview.xml.binding.embl.ROOT) um
 187               .unmarshal(streamReader);
 188
 189       /*
 190        * document root contains either "entry" or "entrySet"
 191        */
 192       if (root == null)
 193       {
 194         return entries;
 195       }
 196       if (root.getEntrySet() != null)
 197       {
 198         entries = root.getEntrySet().getEntry();
 199       }
 200       else if (root.getEntry() != null)
 201       {
 202         entries.add(root.getEntry());
 203       }
 204     } catch (JAXBException | XMLStreamException
 205             | FactoryConfigurationError e)
 206     {
 207       e.printStackTrace();
 208     }
 209     return entries;
 210   }
 211
 212   /**
 213    * A helper method to parse XML data and construct a sequence, with any
 214    * available database references and features
 215    *
 216    * @param emprefx
 217    * @param entry
 218    * @param peptides
 219    * @return
 220    */
 221   SequenceI getSequence(String sourceDb, EntryType entry,
 222           List<SequenceI> peptides)
 223   {
 224     String seqString = entry.getSequence();
 225     if (seqString == null)
 226     {
 227       return null;
 228     }
 229     seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
 230             "");
 231     String accession = entry.getAccession();
 232     SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
 233
 234     dna.setDescription(entry.getDescription());
 235     String sequenceVersion = String.valueOf(entry.getVersion().intValue());
 236     DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
 237             accession);
 238     dna.addDBRef(selfRref);
 239     selfRref.setMap(
 240             new Mapping(null, new int[]
 241             { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
 242                     1));
 243
 244     /*
 245      * add db references
 246      */
 247     List<XrefType> dbRefs = entry.getXref();
 248     if (dbRefs != null)
 249     {
 250       for (XrefType dbref : dbRefs)
 251       {
 252         String acc = dbref.getId();
 253         String source = DBRefUtils.getCanonicalName(dbref.getDb());
 254         String version = dbref.getSecondaryId();
 255         dna.addDBRef(new DBRefEntry(source, version, acc));
 256       }
 257     }
 258
 259     SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
 260     try
 261     {
 262       List<Feature> features = entry.getFeature();
 263       if (features != null)
 264       {
 265         for (Feature feature : features)
 266         {
 267           if (FeatureProperties.isCodingFeature(sourceDb,
 268                   feature.getName()))
 269           {
 270             parseCodingFeature(entry, feature, sourceDb, dna, peptides,
 271                     matcher);
 272           }
 273         }
 274       }
 275     } catch (Exception e)
 276     {
 277       System.err.println("EMBL Record Features parsing error!");
 278       System.err
 279               .println("Please report the following to help@jalview.org :");
 280       System.err.println("EMBL Record " + accession);
 281       System.err.println("Resulted in exception: " + e.getMessage());
 282       e.printStackTrace(System.err);
 283     }
 284
 285     return dna;
 286   }
 287
 288   /**
 289    * Extracts coding region and product from a CDS feature and decorates it with
 290    * annotations
 291    *
 292    * @param entry
 293    * @param feature
 294    * @param sourceDb
 295    * @param dna
 296    * @param peptides
 297    * @param matcher
 298    */
 299   void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
 300           SequenceI dna, List<SequenceI> peptides,
 301           SequenceIdMatcher matcher)
 302   {
 303     final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
 304     final String accession = entry.getAccession();
 305     final String sequenceVersion = entry.getVersion().toString();
 306
 307     int[] exons = getCdsRanges(entry.getAccession(), feature);
 308
 309     String translation = null;
 310     String proteinName = "";
 311     String proteinId = null;
 312     Map<String, String> vals = new Hashtable<>();
 313
 314     /*
 315      * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
 316      * (phase is required for CDS features in GFF3 format)
 317      */
 318     int codonStart = 1;
 319
 320     /*
 321      * parse qualifiers, saving protein translation, protein id,
 322      * codon start position, product (name), and 'other values'
 323      */
 324     if (feature.getQualifier() != null)
 325     {
 326       for (Qualifier q : feature.getQualifier())
 327       {
 328         String qname = q.getName();
 329         String value = q.getValue();
 330         value = value == null ? ""
 331                 : value.trim().replace(" ", "").replace("\n", "")
 332                         .replace("\t", "");
 333         if (qname.equals("translation"))
 334         {
 335           translation = value;
 336         }
 337         else if (qname.equals("protein_id"))
 338         {
 339           proteinId = value;
 340         }
 341         else if (qname.equals("codon_start"))
 342         {
 343           try
 344           {
 345             codonStart = Integer.parseInt(value.trim());
 346           } catch (NumberFormatException e)
 347           {
 348             System.err.println("Invalid codon_start in XML for "
 349                     + entry.getAccession() + ": " + e.getMessage());
 350           }
 351         }
 352         else if (qname.equals("product"))
 353         {
 354           // sometimes name is returned e.g. for V00488
 355           proteinName = value;
 356         }
 357         else
 358         {
 359           // throw anything else into the additional properties hash
 360           if (!"".equals(value))
 361           {
 362             vals.put(qname, value);
 363           }
 364         }
 365       }
 366     }
 367
 368     DBRefEntry proteinToEmblProteinRef = null;
 369     exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
 370
 371     SequenceI product = null;
 372     Mapping dnaToProteinMapping = null;
 373     if (translation != null && proteinName != null && proteinId != null)
 374     {
 375       int translationLength = translation.length();
 376
 377       /*
 378        * look for product in peptides list, if not found, add it
 379        */
 380       product = matcher.findIdMatch(proteinId);
 381       if (product == null)
 382       {
 383         product = new Sequence(proteinId, translation, 1,
 384                 translationLength);
 385         product.setDescription(((proteinName.length() == 0)
 386                 ? "Protein Product from " + sourceDb
 387                 : proteinName));
 388         peptides.add(product);
 389         matcher.add(product);
 390       }
 391
 392       // we have everything - create the mapping and perhaps the protein
 393       // sequence
 394       if (exons == null || exons.length == 0)
 395       {
 396         /*
 397          * workaround until we handle dna location for CDS sequence
 398          * e.g. location="X53828.1:60..1058" correctly
 399          */
 400         System.err.println(
 401                 "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
 402                         + sourceDb + ":" + entry.getAccession() + ")");
 403         int dnaLength = dna.getLength();
 404         if (translationLength * 3 == (1 - codonStart + dnaLength))
 405         {
 406           System.err.println(
 407                   "Not allowing for additional stop codon at end of cDNA fragment... !");
 408           // this might occur for CDS sequences where no features are marked
 409           exons = new int[] { dna.getStart() + (codonStart - 1),
 410               dna.getEnd() };
 411           dnaToProteinMapping = new Mapping(product, exons,
 412                   new int[]
 413                   { 1, translationLength }, 3, 1);
 414         }
 415         if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
 416         {
 417           System.err.println(
 418                   "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
 419           exons = new int[] { dna.getStart() + (codonStart - 1),
 420               dna.getEnd() - 3 };
 421           dnaToProteinMapping = new Mapping(product, exons,
 422                   new int[]
 423                   { 1, translationLength }, 3, 1);
 424         }
 425       }
 426       else
 427       {
 428         // Trim the exon mapping if necessary - the given product may only be a
 429         // fragment of a larger protein. (EMBL:AY043181 is an example)
 430
 431         if (isEmblCdna)
 432         {
 433           // TODO: Add a DbRef back to the parent EMBL sequence with the exon
 434           // map
 435           // if given a dataset reference, search dataset for parent EMBL
 436           // sequence if it exists and set its map
 437           // make a new feature annotating the coding contig
 438         }
 439         else
 440         {
 441           // final product length truncation check
 442           int[] cdsRanges = adjustForProteinLength(translationLength,
 443                   exons);
 444           dnaToProteinMapping = new Mapping(product, cdsRanges,
 445                   new int[]
 446                   { 1, translationLength }, 3, 1);
 447           if (product != null)
 448           {
 449             /*
 450              * make xref with mapping from protein to EMBL dna
 451              */
 452             DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
 453                     sequenceVersion, proteinId,
 454                     new Mapping(dnaToProteinMapping.getMap().getInverse()));
 455             product.addDBRef(proteinToEmblRef);
 456
 457             /*
 458              * make xref from protein to EMBLCDS; we assume here that the
 459              * CDS sequence version is same as dna sequence (?!)
 460              */
 461             MapList proteinToCdsMapList = new MapList(
 462                     new int[]
 463                     { 1, translationLength },
 464                     new int[]
 465                     { 1 + (codonStart - 1),
 466                         (codonStart - 1) + 3 * translationLength },
 467                     1, 3);
 468             DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
 469                     DBRefSource.EMBLCDS, sequenceVersion, proteinId,
 470                     new Mapping(proteinToCdsMapList));
 471             product.addDBRef(proteinToEmblCdsRef);
 472
 473             /*
 474              * make 'direct' xref from protein to EMBLCDSPROTEIN
 475              */
 476             proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
 477             proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
 478             proteinToEmblProteinRef.setMap(null);
 479             product.addDBRef(proteinToEmblProteinRef);
 480           }
 481         }
 482       }
 483
 484       /*
 485        * add cds features to dna sequence
 486        */
 487       String cds = feature.getName(); // "CDS"
 488       for (int xint = 0; exons != null
 489               && xint < exons.length - 1; xint += 2)
 490       {
 491         int exonStart = exons[xint];
 492         int exonEnd = exons[xint + 1];
 493         int begin = Math.min(exonStart, exonEnd);
 494         int end = Math.max(exonStart, exonEnd);
 495         int exonNumber = xint / 2 + 1;
 496         String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
 497                 exonNumber, proteinName, proteinId);
 498
 499         SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
 500                 vals);
 501
 502         sf.setEnaLocation(feature.getLocation());
 503         boolean forwardStrand = exonStart <= exonEnd;
 504         sf.setStrand(forwardStrand ? "+" : "-");
 505         sf.setPhase(String.valueOf(codonStart - 1));
 506         sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 507         sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 508
 509         dna.addSequenceFeature(sf);
 510       }
 511     }
 512
 513     /*
 514      * add feature dbRefs to sequence, and mappings for Uniprot xrefs
 515      */
 516     boolean hasUniprotDbref = false;
 517     List<XrefType> xrefs = feature.getXref();
 518     if (xrefs != null)
 519     {
 520       boolean mappingUsed = false;
 521       for (XrefType xref : xrefs)
 522       {
 523         /*
 524          * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 525          */
 526         String source = DBRefUtils.getCanonicalName(xref.getDb());
 527         DBRefEntry dbref = new DBRefEntry(source, xref.getSecondaryId(),
 528                 xref.getId());
 529         DBRefEntry proteinDbRef = new DBRefEntry(dbref.getSource(),
 530                 dbref.getVersion(), dbref.getAccessionId());
 531         if (source.equals(DBRefSource.UNIPROT))
 532         {
 533           String proteinSeqName = DBRefSource.UNIPROT + "|"
 534                   + dbref.getAccessionId();
 535           if (dnaToProteinMapping != null
 536                   && dnaToProteinMapping.getTo() != null)
 537           {
 538             if (mappingUsed)
 539             {
 540               /*
 541                * two or more Uniprot xrefs for the same CDS -
 542                * each needs a distinct Mapping (as to a different sequence)
 543                */
 544               dnaToProteinMapping = new Mapping(dnaToProteinMapping);
 545             }
 546             mappingUsed = true;
 547
 548             /*
 549              * try to locate the protein mapped to (possibly by a
 550              * previous CDS feature); if not found, construct it from
 551              * the EMBL translation
 552              */
 553             SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
 554             if (proteinSeq == null)
 555             {
 556               proteinSeq = new Sequence(proteinSeqName,
 557                       product.getSequenceAsString());
 558               matcher.add(proteinSeq);
 559               peptides.add(proteinSeq);
 560             }
 561             dnaToProteinMapping.setTo(proteinSeq);
 562             dnaToProteinMapping.setMappedFromId(proteinId);
 563             proteinSeq.addDBRef(proteinDbRef);
 564             dbref.setMap(dnaToProteinMapping);
 565           }
 566           hasUniprotDbref = true;
 567         }
 568         if (product != null)
 569         {
 570           /*
 571            * copy feature dbref to our protein product
 572            */
 573           DBRefEntry pref = proteinDbRef;
 574           pref.setMap(null); // reference is direct
 575           product.addDBRef(pref);
 576           // Add converse mapping reference
 577           if (dnaToProteinMapping != null)
 578           {
 579             Mapping pmap = new Mapping(dna,
 580                     dnaToProteinMapping.getMap().getInverse());
 581             pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
 582             pref.setMap(pmap);
 583             if (dnaToProteinMapping.getTo() != null)
 584             {
 585               dnaToProteinMapping.getTo().addDBRef(pref);
 586             }
 587           }
 588         }
 589         dna.addDBRef(dbref);
 590       }
 591     }
 592
 593     /*
 594      * if we have a product (translation) but no explicit Uniprot dbref
 595      * (example: EMBL AAFI02000057 protein_id EAL65544.1)
 596      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 597      */
 598     if (!hasUniprotDbref && product != null)
 599     {
 600       if (proteinToEmblProteinRef == null)
 601       {
 602         // assuming CDSPROTEIN sequence version = dna version (?!)
 603         proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 604                 sequenceVersion, proteinId);
 605       }
 606       product.addDBRef(proteinToEmblProteinRef);
 607
 608       if (dnaToProteinMapping != null
 609               && dnaToProteinMapping.getTo() != null)
 610       {
 611         DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 612                 DBRefSource.EMBLCDSProduct, sequenceVersion,
 613                 proteinId);
 614         dnaToEmblProteinRef.setMap(dnaToProteinMapping);
 615         dnaToProteinMapping.setMappedFromId(proteinId);
 616         dna.addDBRef(dnaToEmblProteinRef);
 617       }
 618     }
 619   }
 620
 621   @Override
 622   public boolean isDnaCoding()
 623   {
 624     return true;
 625   }
 626
 627   /**
 628    * Returns the CDS positions as a single array of [start, end, start, end...]
 629    * positions. If on the reverse strand, these will be in descending order.
 630    *
 631    * @param accession
 632    * @param feature
 633    * @return
 634    */
 635   protected int[] getCdsRanges(String accession, Feature feature)
 636   {
 637     String location = feature.getLocation();
 638     if (location == null)
 639     {
 640       return new int[] {};
 641     }
 642
 643     try
 644     {
 645       List<int[]> ranges = DnaUtils.parseLocation(location);
 646       return listToArray(ranges);
 647     } catch (ParseException e)
 648     {
 649       Cache.log.warn(
 650               String.format("Not parsing inexact CDS location %s in ENA %s",
 651                       location, accession));
 652       return new int[] {};
 653     }
 654   }
 655
 656   /**
 657    * Converts a list of [start, end] ranges to a single array of [start, end,
 658    * start, end ...]
 659    *
 660    * @param ranges
 661    * @return
 662    */
 663   int[] listToArray(List<int[]> ranges)
 664   {
 665     int[] result = new int[ranges.size() * 2];
 666     int i = 0;
 667     for (int[] range : ranges)
 668     {
 669       result[i++] = range[0];
 670       result[i++] = range[1];
 671     }
 672     return result;
 673   }
 674
 675   /**
 676    * Helper method to construct a SequenceFeature for one cds range
 677    *
 678    * @param type
 679    *          feature type ("CDS")
 680    * @param desc
 681    *          description
 682    * @param begin
 683    *          start position
 684    * @param end
 685    *          end position
 686    * @param group
 687    *          feature group
 688    * @param vals
 689    *          map of 'miscellaneous values' for feature
 690    * @return
 691    */
 692   protected SequenceFeature makeCdsFeature(String type, String desc,
 693           int begin, int end, String group, Map<String, String> vals)
 694   {
 695     SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
 696     if (!vals.isEmpty())
 697     {
 698       StringBuilder sb = new StringBuilder();
 699       boolean first = true;
 700       for (Entry<String, String> val : vals.entrySet())
 701       {
 702         if (!first)
 703         {
 704           sb.append(";");
 705         }
 706         sb.append(val.getKey()).append("=").append(val.getValue());
 707         first = false;
 708         sf.setValue(val.getKey(), val.getValue());
 709       }
 710       sf.setAttributes(sb.toString());
 711     }
 712     return sf;
 713   }
 714
 715   /**
 716    * Truncates (if necessary) the exon intervals to match 3 times the length of
 717    * the protein; also accepts 3 bases longer (for stop codon not included in
 718    * protein)
 719    *
 720    * @param proteinLength
 721    * @param exon
 722    *          an array of [start, end, start, end...] intervals
 723    * @return the same array (if unchanged) or a truncated copy
 724    */
 725   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 726   {
 727     if (proteinLength <= 0 || exon == null)
 728     {
 729       return exon;
 730     }
 731     int expectedCdsLength = proteinLength * 3;
 732     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 733
 734     /*
 735      * if exon length matches protein, or is shorter, or longer by the
 736      * length of a stop codon (3 bases), then leave it unchanged
 737      */
 738     if (expectedCdsLength >= exonLength
 739             || expectedCdsLength == exonLength - 3)
 740     {
 741       return exon;
 742     }
 743
 744     int origxon[];
 745     int sxpos = -1;
 746     int endxon = 0;
 747     origxon = new int[exon.length];
 748     System.arraycopy(exon, 0, origxon, 0, exon.length);
 749     int cdspos = 0;
 750     for (int x = 0; x < exon.length; x += 2)
 751     {
 752       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 753       if (expectedCdsLength <= cdspos)
 754       {
 755         // advanced beyond last codon.
 756         sxpos = x;
 757         if (expectedCdsLength != cdspos)
 758         {
 759           // System.err
 760           // .println("Truncating final exon interval on region by "
 761           // + (cdspos - cdslength));
 762         }
 763
 764         /*
 765          * shrink the final exon - reduce end position if forward
 766          * strand, increase it if reverse
 767          */
 768         if (exon[x + 1] >= exon[x])
 769         {
 770           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 771         }
 772         else
 773         {
 774           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 775         }
 776         break;
 777       }
 778     }
 779
 780     if (sxpos != -1)
 781     {
 782       // and trim the exon interval set if necessary
 783       int[] nxon = new int[sxpos + 2];
 784       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 785       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 786                                 // set
 787       exon = nxon;
 788     }
 789     return exon;
 790   }
 791
 792 }