src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   /**
  51    * A data bean class to hold values parsed from one CDS Feature (FT)
  52    */
  53   class CdsData
  54   {
  55     String translation; // from CDS feature /translation
  56
  57     String cdsLocation; // CDS /location raw value
  58
  59     int codonStart = 1; // from CDS /codon_start
  60
  61     String proteinName; // from CDS /product; used for protein description
  62
  63     String proteinId; // from CDS /protein_id
  64
  65     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  66
  67     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  68   }
  69
  70   private static final String WHITESPACE = "\\s+";
  71
  72   private String sourceDb;
  73
  74   /*
  75    * values parsed from the EMBL flatfile record
  76    */
  77   private String accession; // from ID (first token)
  78
  79   private String version; // from ID (second token)
  80
  81   private String description; // from (first) DE line
  82
  83   private int length = 128; // from ID (7th token), with usable default
  84
  85   private List<DBRefEntry> dbrefs; // from DR
  86
  87   private String sequenceString; // from SQ lines
  88
  89   /*
  90    * parsed CDS data fields, keyed by protein_id
  91    */
  92   private Map<String, CdsData> cds;
  93
  /**
   * Creates a parser over the given data source. The input is not parsed
   * here; call {@code parse()} to process it.
   *
   * @param fp
   *          the source of the flatfile data
   * @param sourceId
   *          the database name to record as the source of the parsed sequence
   * @throws IOException
   */
  public EmblFlatFile(FileParse fp, String sourceId) throws IOException
  {
    super(false, fp); // false: do not parse immediately

    /*
     * a TreeMap holds the CDS entries sorted by protein_id (ignoring case),
     * so the CDS sequences come out in a readable, alphabetical order
     */
    this.cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    this.dbrefs = new ArrayList<>();
    this.sourceDb = sourceId;
  }
 112
 113   /**
 114    * Parses the flatfile, and if successful, saves as an annotated sequence
 115    * which may be retrieved by calling {@code getSequence()}
 116    *
 117    * @throws IOException
 118    */
 119   public void parse() throws IOException
 120   {
 121     String line = nextLine();
 122     while (line != null)
 123     {
 124       if (line.startsWith("ID"))
 125       {
 126         line = parseID(line);
 127       }
 128       else if (line.startsWith("DE"))
 129       {
 130         line = parseDE(line);
 131       }
 132       else if (line.startsWith("DR"))
 133       {
 134         line = parseDR(line);
 135       }
 136       else if (line.startsWith("SQ"))
 137       {
 138         line = parseSQ();
 139       }
 140       else if (line.startsWith("FT"))
 141       {
 142         line = parseFT(line);
 143       }
 144       else
 145       {
 146         line = nextLine();
 147       }
 148     }
 149     buildSequence();
 150   }
 151
 152   /**
 153    * Extracts and saves the primary accession and version (SV value) from an ID
 154    * line, or null if not found. Returns the next line after the one processed.
 155    *
 156    * @param line
 157    * @throws IOException
 158    */
 159   String parseID(String line) throws IOException
 160   {
 161     String[] tokens = line.substring(2).split(";");
 162
 163     /*
 164      * first is primary accession
 165      */
 166     String token = tokens[0].trim();
 167     if (!token.isEmpty())
 168     {
 169       this.accession = token;
 170     }
 171
 172     /*
 173      * second token is 'SV versionNo'
 174      */
 175     if (tokens.length > 1)
 176     {
 177       token = tokens[1].trim();
 178       if (token.startsWith("SV"))
 179       {
 180         String[] bits = token.trim().split(WHITESPACE);
 181         this.version = bits[bits.length - 1];
 182       }
 183     }
 184
 185     /*
 186      * seventh token is 'length BP'
 187      */
 188     if (tokens.length > 6)
 189     {
 190       token = tokens[6].trim();
 191       String[] bits = token.trim().split(WHITESPACE);
 192       try
 193       {
 194         this.length = Integer.valueOf(bits[0]);
 195       } catch (NumberFormatException e)
 196       {
 197         Cache.log.error("bad length read in flatfile, line: " + line);
 198       }
 199     }
 200
 201     return nextLine();
 202   }
 203
 204   /**
 205    * Reads sequence description from the first DE line found. Any trailing
 206    * period is discarded. If there are multiple DE lines, only the first (short
 207    * description) is read, the rest are ignored.
 208    *
 209    * @param line
 210    * @return
 211    * @throws IOException
 212    */
 213   String parseDE(String line) throws IOException
 214   {
 215     String desc = line.substring(2).trim();
 216     if (desc.endsWith("."))
 217     {
 218       desc = desc.substring(0, desc.length() - 1);
 219     }
 220     this.description = desc;
 221
 222     /*
 223      * pass over any additional DE lines
 224      */
 225     while ((line = nextLine()) != null)
 226     {
 227       if (!line.startsWith("DE"))
 228       {
 229         break;
 230       }
 231     }
 232
 233     return line;
 234   }
 235
 236   /**
 237    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 238    * the line following the line processed.
 239    *
 240    * @param line
 241    * @throws IOException
 242    */
 243   String parseDR(String line) throws IOException
 244   {
 245     String[] tokens = line.substring(2).split(";");
 246     if (tokens.length > 1)
 247     {
 248       /*
 249        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 250        */
 251       String db = tokens[0].trim();
 252       db = DBRefUtils.getCanonicalName(db);
 253       String acc = tokens[1].trim();
 254       if (acc.endsWith("."))
 255       {
 256         acc = acc.substring(0, acc.length() - 1);
 257       }
 258       String version = "0";
 259       if (tokens.length > 2)
 260       {
 261         String secondaryId = tokens[2].trim();
 262         if (!secondaryId.isEmpty())
 263         {
 264           // todo: is this right? secondary id is not a version number
 265           // version = secondaryId;
 266         }
 267       }
 268       this.dbrefs.add(new DBRefEntry(db, version, acc));
 269     }
 270
 271     return nextLine();
 272   }
 273
 274   /**
 275    * Reads and saves the sequence, read from the lines following the SQ line.
 276    * Whitespace and position counters are discarded. Returns the next line
 277    * following the sequence data (the next line that doesn't start with
 278    * whitespace).
 279    *
 280    * @throws IOException
 281    */
 282   String parseSQ() throws IOException
 283   {
 284     StringBuilder sb = new StringBuilder(this.length);
 285     String line = nextLine();
 286     while (line != null && line.startsWith(" "))
 287     {
 288       line = line.trim();
 289       String[] blocks = line.split(WHITESPACE);
 290
 291       /*
 292        * omit the last block (position counter) on each line
 293        */
 294       for (int i = 0; i < blocks.length - 1; i++)
 295       {
 296         sb.append(blocks[i]);
 297       }
 298       line = nextLine();
 299     }
 300     this.sequenceString = sb.toString();
 301
 302     return line;
 303   }
 304
 305   /**
 306    * Processes an FT line. If it declares a feature type of interest (currently,
 307    * only CDS is processed), processes all of the associated lines (feature
 308    * qualifiers), and returns the next line after that, otherwise simply returns
 309    * the next line.
 310    *
 311    * @param line
 312    * @return
 313    * @throws IOException
 314    */
 315   String parseFT(String line) throws IOException
 316   {
 317     String[] tokens = line.split(WHITESPACE);
 318     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 319     {
 320       return nextLine();
 321     }
 322
 323     CdsData data = new CdsData();
 324     data.cdsLocation = tokens[2];
 325
 326     line = nextLine();
 327     while (line != null)
 328     {
 329       if (!line.startsWith("FT    ")) // 4 spaces
 330       {
 331         // e.g. start of next feature "FT source..."
 332         break;
 333       }
 334
 335       /*
 336        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 337        */
 338       int slashPos = line.indexOf('/');
 339       if (slashPos == -1)
 340       {
 341         Cache.log.error("Unexpected EMBL line ignored: " + line);
 342         continue;
 343       }
 344       int eqPos = line.indexOf('=', slashPos + 1);
 345       if (eqPos == -1)
 346       {
 347         // can happen, e.g. /ribosomal_slippage
 348 //        Cache.log.error("Unexpected EMBL line ignored: " + line);
 349         line = nextLine();
 350         continue;
 351       }
 352       String qualifier = line.substring(slashPos + 1, eqPos);
 353       String value = line.substring(eqPos + 1);
 354       if (value.startsWith(QUOTE) && value.endsWith(QUOTE))
 355       {
 356         value = value.substring(1, value.length() - 1);
 357       }
 358
 359       if ("protein_id".equals(qualifier))
 360       {
 361         data.proteinId = value;
 362         line = nextLine();
 363       }
 364       else if ("codon_start".equals(qualifier))
 365       {
 366         try
 367         {
 368           data.codonStart = Integer.parseInt(value.trim());
 369         } catch (NumberFormatException e)
 370         {
 371           Cache.log.error("Invalid codon_start in XML for " + this.accession
 372                   + ": " + e.getMessage());
 373         }
 374         line = nextLine();
 375       }
 376       else if ("db_xref".equals(qualifier))
 377       {
 378         String[] parts = value.split(":");
 379         if (parts.length == 2)
 380         {
 381           String db = parts[0].trim();
 382           db = DBRefUtils.getCanonicalName(db);
 383           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 384           data.xrefs.add(dbref);
 385         }
 386         line = nextLine();
 387       }
 388       else if ("product".equals(qualifier))
 389       {
 390         // sometimes name is returned e.g. for V00488
 391         data.proteinName = value;
 392         line = nextLine();
 393       }
 394       else if ("translation".equals(qualifier))
 395       {
 396         line = parseTranslation(value, data);
 397       }
 398       else if (!"".equals(value))
 399       {
 400         // throw anything else into the additional properties hash
 401         data.cdsProps.put(qualifier, value);
 402         line = nextLine();
 403       }
 404     }
 405
 406     if (data.proteinId != null)
 407     {
 408       this.cds.put(data.proteinId, data);
 409     }
 410     else
 411     {
 412       Cache.log.error("Ignoring CDS feature with no protein_id for "
 413               + sourceDb + ":" + accession);
 414     }
 415
 416     return line;
 417   }
 418
 419   /**
 420    * Reads and returns the CDS translation from one or more lines of the file,
 421    * and returns the next line after that
 422    *
 423    * @param value
 424    *          the first line of the translation (likely quoted)
 425    * @param data
 426    * @return
 427    * @throws IOException
 428    */
 429   String parseTranslation(String value, CdsData data) throws IOException
 430   {
 431     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
 432     sb.append(value.replace(QUOTE, ""));
 433
 434     String line;
 435     while ((line = nextLine()) != null)
 436     {
 437       if (!line.startsWith("FT    "))
 438       {
 439         break; // reached next feature or other input line
 440       }
 441       String[] tokens = line.split(WHITESPACE);
 442       if (tokens.length < 2)
 443       {
 444         Cache.log.error("Ignoring bad EMBL line: " + line);
 445         break;
 446       }
 447       if (tokens[1].startsWith("/"))
 448       {
 449         break; // next feature qualifier
 450       }
 451       sb.append(tokens[1].replace(QUOTE, ""));
 452     }
 453
 454     data.translation = sb.toString();
 455
 456     return line;
 457   }
 458
 459   /**
 460    * Constructs and saves the sequence from parsed components
 461    */
 462   void buildSequence()
 463   {
 464     String name = this.accession;
 465     if (this.sourceDb != null)
 466     {
 467       name = this.sourceDb + "|" + name;
 468     }
 469     SequenceI seq = new Sequence(name, this.sequenceString);
 470     seq.setDescription(this.description);
 471
 472     /*
 473      * add a DBRef to itself
 474      */
 475     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 476     int[] startEnd = new int[] { 1, seq.getLength() };
 477     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 478     seq.addDBRef(selfRef);
 479
 480     for (DBRefEntry dbref : this.dbrefs)
 481     {
 482       seq.addDBRef(dbref);
 483     }
 484
 485     processCDSFeatures(seq);
 486
 487     seq.deriveSequence();
 488
 489     addSequence(seq);
 490   }
 491
 492   /**
 493    * Process the CDS features, including generation of cross-references and
 494    * mappings to the protein products (translation)
 495    *
 496    * @param seq
 497    */
 498   protected void processCDSFeatures(SequenceI seq)
 499   {
 500     /*
 501      * record protein products found to avoid duplication i.e. >1 CDS with
 502      * the same /protein_id [though not sure I can find an example of this]
 503      */
 504     Map<String, SequenceI> proteins = new HashMap<>();
 505     for (CdsData data : cds.values())
 506     {
 507       processCDSFeature(seq, data, proteins);
 508     }
 509   }
 510
 511   /**
 512    * Processes data for one parsed CDS feature to
 513    * <ul>
 514    * <li>create a protein product sequence for the translation</li>
 515    * <li>create a cross-reference to protein with mapping from dna</li>
 516    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 517    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 518    * </ul>
 519    *
 520    * @param SequenceI
 521    *          dna
 522    * @param proteins
 523    *          map of protein products so far derived from CDS data
 524    */
 525   void processCDSFeature(SequenceI dna, CdsData data,
 526           Map<String, SequenceI> proteins)
 527   {
 528     /*
 529      * parse location into a list of [start, end, start, end] positions
 530      */
 531     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 532
 533     MapList maplist = buildMappingToProtein(dna, exons, data);
 534
 535     int exonNumber = 0;
 536
 537     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 538     {
 539       int exonStart = exons[xint];
 540       int exonEnd = exons[xint + 1];
 541       int begin = Math.min(exonStart, exonEnd);
 542       int end = Math.max(exonStart, exonEnd);
 543       exonNumber++;
 544       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 545               exonNumber, data.proteinId);
 546
 547       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 548               this.sourceDb);
 549       for (Entry<String, String> val : data.cdsProps.entrySet())
 550       {
 551         sf.setValue(val.getKey(), val.getValue());
 552       }
 553
 554       sf.setEnaLocation(data.cdsLocation);
 555       boolean forwardStrand = exonStart <= exonEnd;
 556       sf.setStrand(forwardStrand ? "+" : "-");
 557       sf.setPhase(String.valueOf(data.codonStart - 1));
 558       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 559       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 560
 561       dna.addSequenceFeature(sf);
 562     }
 563
 564     boolean hasUniprotDbref = false;
 565     for (DBRefEntry xref : data.xrefs)
 566     {
 567       dna.addDBRef(xref);
 568       if (xref.getSource().equals(DBRefSource.UNIPROT))
 569       {
 570         /*
 571          * construct (or find) the sequence for (data.protein_id, data.translation)
 572          */
 573         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 574         Mapping map = new Mapping(protein, maplist);
 575         map.setMappedFromId(data.proteinId);
 576         xref.setMap(map);
 577
 578         /*
 579          * add DBRefs with mappings from dna to protein and the inverse
 580          */
 581         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 582         db1.setMap(new Mapping(dna, maplist.getInverse()));
 583         protein.addDBRef(db1);
 584
 585         hasUniprotDbref = true;
 586       }
 587     }
 588
 589     /*
 590      * if we have a product (translation) but no explicit Uniprot dbref
 591      * (example: EMBL M19487 protein_id AAB02592.1)
 592      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 593      */
 594     if (!hasUniprotDbref)
 595     {
 596       SequenceI protein = proteins.get(data.proteinId);
 597       if (protein == null)
 598       {
 599         protein = new Sequence(data.proteinId, data.translation);
 600         protein.setDescription(data.proteinName);
 601         proteins.put(data.proteinId, protein);
 602       }
 603       // assuming CDSPROTEIN sequence version = dna version (?!)
 604       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 605               this.version, data.proteinId);
 606       protein.addDBRef(db1);
 607
 608       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 609               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 610       Mapping map = new Mapping(protein, maplist);
 611       map.setMappedFromId(data.proteinId);
 612       dnaToEmblProteinRef.setMap(map);
 613       dna.addDBRef(dnaToEmblProteinRef);
 614      }
 615
 616     /*
 617      * comment brought forward from EmblXmlSource, lines 447-451:
 618      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 619      * sequence with the exon  map; if given a dataset reference, search
 620      * dataset for parent EMBL sequence if it exists and set its map;
 621      * make a new feature annotating the coding contig
 622      */
 623   }
 624
 625   /**
 626    * Computes a mapping from CDS positions in DNA sequence to protein product
 627    * positions, with allowance for stop codon or incomplete start codon
 628    *
 629    * @param dna
 630    * @param exons
 631    * @param data
 632    * @return
 633    */
 634   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 635           final CdsData data)
 636   {
 637     MapList dnaToProteinMapping = null;
 638     int peptideLength = data.translation.length();
 639
 640     int[] proteinRange = new int[] { 1, peptideLength };
 641     if (exons != null && exons.length > 0)
 642     {
 643       /*
 644        * We were able to parse 'location'; do a final
 645        * product length truncation check
 646        */
 647       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 648       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 649     }
 650     else
 651     {
 652       /*
 653        * workaround until we handle all 'location' formats fully
 654        * e.g. X53828.1:60..1058 or <123..>289
 655        */
 656       Cache.log.error(String.format(
 657               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 658                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 659               data.cdsLocation, sourceDb, this.accession));
 660
 661       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 662       int mappedDnaEnd = dna.getEnd();
 663       if (peptideLength * 3 == completeCodonsLength)
 664       {
 665         // this might occur for CDS sequences where no features are marked
 666         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 667         mappedDnaEnd = dna.getEnd();
 668       }
 669       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 670       {
 671         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 672         mappedDnaEnd = dna.getEnd() - 3;
 673       }
 674
 675       if (mappedDnaEnd != -1)
 676       {
 677         int[] cdsRanges = new int[] {
 678             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 679         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 680       }
 681     }
 682
 683     return dnaToProteinMapping;
 684   }
 685
 686   /**
 687    * Constructs a sequence for the protein product for the CDS data (if there is
 688    * one), and dbrefs with mappings from CDS to protein and the reverse
 689    *
 690    * @param dna
 691    * @param xref
 692    * @param data
 693    * @param proteins
 694    * @return
 695    */
 696   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 697           CdsData data, Map<String, SequenceI> proteins)
 698   {
 699     /*
 700      * check we have some data to work with
 701      */
 702     if (data.proteinId == null || data.translation == null)
 703     {
 704       return null;
 705     }
 706
 707     /*
 708      * Construct the protein sequence (if not already seen)
 709      */
 710     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 711     SequenceI protein = proteins.get(proteinSeqName);
 712     if (protein == null)
 713     {
 714       protein = new Sequence(proteinSeqName, data.translation, 1,
 715               data.translation.length());
 716       protein.setDescription(data.proteinName != null ? data.proteinName
 717               : "Protein Product from " + sourceDb);
 718       proteins.put(proteinSeqName, protein);
 719     }
 720
 721     return protein;
 722   }
 723
 724   /**
 725    * Returns the CDS location as a single array of [start, end, start, end...]
 726    * positions. If on the reverse strand, these will be in descending order.
 727    *
 728    * @param accession
 729    * @param location
 730    * @return
 731    */
 732   protected int[] getCdsRanges(String accession, String location)
 733   {
 734     if (location == null)
 735     {
 736       return new int[] {};
 737     }
 738
 739     try
 740     {
 741       List<int[]> ranges = DnaUtils.parseLocation(location);
 742       return MappingUtils.listToArray(ranges);
 743     } catch (ParseException e)
 744     {
 745       Cache.log.warn(
 746               String.format("Not parsing inexact CDS location %s in ENA %s",
 747                       location, accession));
 748       return new int[] {};
 749     }
 750   }
 751
 752   /**
 753    * Output (print) is not implemented for EMBL flat file format
 754    */
 755   @Override
 756   public String print(SequenceI[] seqs, boolean jvsuffix)
 757   {
 758     return null;
 759   }
 760
 761   /**
 762    * Truncates (if necessary) the exon intervals to match 3 times the length of
 763    * the protein; also accepts 3 bases longer (for stop codon not included in
 764    * protein)
 765    *
 766    * @param proteinLength
 767    * @param exon
 768    *          an array of [start, end, start, end...] intervals
 769    * @return the same array (if unchanged) or a truncated copy
 770    */
 771   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 772   {
 773     if (proteinLength <= 0 || exon == null)
 774     {
 775       return exon;
 776     }
 777     int expectedCdsLength = proteinLength * 3;
 778     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 779
 780     /*
 781      * if exon length matches protein, or is shorter, or longer by the
 782      * length of a stop codon (3 bases), then leave it unchanged
 783      */
 784     if (expectedCdsLength >= exonLength
 785             || expectedCdsLength == exonLength - 3)
 786     {
 787       return exon;
 788     }
 789
 790     int origxon[];
 791     int sxpos = -1;
 792     int endxon = 0;
 793     origxon = new int[exon.length];
 794     System.arraycopy(exon, 0, origxon, 0, exon.length);
 795     int cdspos = 0;
 796     for (int x = 0; x < exon.length; x += 2)
 797     {
 798       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 799       if (expectedCdsLength <= cdspos)
 800       {
 801         // advanced beyond last codon.
 802         sxpos = x;
 803         if (expectedCdsLength != cdspos)
 804         {
 805           // System.err
 806           // .println("Truncating final exon interval on region by "
 807           // + (cdspos - cdslength));
 808         }
 809
 810         /*
 811          * shrink the final exon - reduce end position if forward
 812          * strand, increase it if reverse
 813          */
 814         if (exon[x + 1] >= exon[x])
 815         {
 816           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 817         }
 818         else
 819         {
 820           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 821         }
 822         break;
 823       }
 824     }
 825
 826     if (sxpos != -1)
 827     {
 828       // and trim the exon interval set if necessary
 829       int[] nxon = new int[sxpos + 2];
 830       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 831       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 832                                 // set
 833       exon = nxon;
 834     }
 835     return exon;
 836   }
 837 }