src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   /**
  51    * A data bean class to hold values parsed from one CDS Feature (FT)
  52    */
  53   class CdsData
  54   {
  55     String translation; // from CDS feature /translation
  56
  57     String cdsLocation; // CDS /location raw value
  58
  59     int codonStart = 1; // from CDS /codon_start
  60
  61     String proteinName; // from CDS /product; used for protein description
  62
  63     String proteinId; // from CDS /protein_id
  64
  65     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  66
  67     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  68   }
  69
  70   private static final String WHITESPACE = "\\s+";
  71
  72   private String sourceDb;
  73
  74   /*
  75    * values parsed from the EMBL flatfile record
  76    */
  77   private String accession; // from ID (first token)
  78
  79   private String version; // from ID (second token)
  80
  81   private String description; // from (first) DE line
  82
  83   private int length = 128; // from ID (7th token), with usable default
  84
  85   private List<DBRefEntry> dbrefs; // from DR
  86
  87   private String sequenceString; // from SQ lines
  88
  89   /*
  90    * parsed CDS data fields, keyed by protein_id
  91    */
  92   private Map<String, CdsData> cds;
  93
  94   /**
  95    * Constructor
  96    *
  97    * @param fp
  98    * @param sourceId
  99    * @throws IOException
 100    */
 101   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
 102   {
 103     super(false, fp); // don't parse immediately
 104     this.sourceDb = sourceId;
 105     dbrefs = new ArrayList<>();
 106
 107     /*
 108      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 109      */
 110     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 111   }
 112
 113   /**
 114    * Parses the flatfile, and if successful, saves as an annotated sequence
 115    * which may be retrieved by calling {@code getSequence()}
 116    *
 117    * @throws IOException
 118    */
 119   public void parse() throws IOException
 120   {
 121     String line = nextLine();
 122     while (line != null)
 123     {
 124       if (line.startsWith("ID"))
 125       {
 126         line = parseID(line);
 127       }
 128       else if (line.startsWith("DE"))
 129       {
 130         line = parseDE(line);
 131       }
 132       else if (line.startsWith("DR"))
 133       {
 134         line = parseDR(line);
 135       }
 136       else if (line.startsWith("SQ"))
 137       {
 138         line = parseSQ();
 139       }
 140       else if (line.startsWith("FT"))
 141       {
 142         line = parseFT(line);
 143       }
 144       else
 145       {
 146         line = nextLine();
 147       }
 148     }
 149     buildSequence();
 150   }
 151
 152   /**
 153    * Extracts and saves the primary accession and version (SV value) from an ID
 154    * line, or null if not found. Returns the next line after the one processed.
 155    *
 156    * @param line
 157    * @throws IOException
 158    */
 159   String parseID(String line) throws IOException
 160   {
 161     String[] tokens = line.substring(2).split(";");
 162
 163     /*
 164      * first is primary accession
 165      */
 166     String token = tokens[0].trim();
 167     if (!token.isEmpty())
 168     {
 169       this.accession = token;
 170     }
 171
 172     /*
 173      * second token is 'SV versionNo'
 174      */
 175     if (tokens.length > 1)
 176     {
 177       token = tokens[1].trim();
 178       if (token.startsWith("SV"))
 179       {
 180         String[] bits = token.trim().split(WHITESPACE);
 181         this.version = bits[bits.length - 1];
 182       }
 183     }
 184
 185     /*
 186      * seventh token is 'length BP'
 187      */
 188     if (tokens.length > 6)
 189     {
 190       token = tokens[6].trim();
 191       String[] bits = token.trim().split(WHITESPACE);
 192       try
 193       {
 194         this.length = Integer.valueOf(bits[0]);
 195       } catch (NumberFormatException e)
 196       {
 197         Cache.log.error("bad length read in flatfile, line: " + line);
 198       }
 199     }
 200
 201     return nextLine();
 202   }
 203
 204   /**
 205    * Reads sequence description from the first DE line found. Any trailing
 206    * period is discarded. If there are multiple DE lines, only the first (short
 207    * description) is read, the rest are ignored.
 208    *
 209    * @param line
 210    * @return
 211    * @throws IOException
 212    */
 213   String parseDE(String line) throws IOException
 214   {
 215     String desc = line.substring(2).trim();
 216     if (desc.endsWith("."))
 217     {
 218       desc = desc.substring(0, desc.length() - 1);
 219     }
 220     this.description = desc;
 221
 222     /*
 223      * pass over any additional DE lines
 224      */
 225     while ((line = nextLine()) != null)
 226     {
 227       if (!line.startsWith("DE"))
 228       {
 229         break;
 230       }
 231     }
 232
 233     return line;
 234   }
 235
 236   /**
 237    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 238    * the line following the line processed.
 239    *
 240    * @param line
 241    * @throws IOException
 242    */
 243   String parseDR(String line) throws IOException
 244   {
 245     String[] tokens = line.substring(2).split(";");
 246     if (tokens.length > 1)
 247     {
 248       /*
 249        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 250        */
 251       String db = tokens[0].trim();
 252       db = DBRefUtils.getCanonicalName(db);
 253       String acc = tokens[1].trim();
 254       if (acc.endsWith("."))
 255       {
 256         acc = acc.substring(0, acc.length() - 1);
 257       }
 258       String version = "0";
 259       if (tokens.length > 2)
 260       {
 261         String secondaryId = tokens[2].trim();
 262         if (!secondaryId.isEmpty())
 263         {
 264           // todo: is this right? secondary id is not a version number
 265           // version = secondaryId;
 266         }
 267       }
 268       this.dbrefs.add(new DBRefEntry(db, version, acc));
 269     }
 270
 271     return nextLine();
 272   }
 273
 274   /**
 275    * Reads and saves the sequence, read from the lines following the SQ line.
 276    * Whitespace and position counters are discarded. Returns the next line
 277    * following the sequence data (the next line that doesn't start with
 278    * whitespace).
 279    *
 280    * @throws IOException
 281    */
 282   String parseSQ() throws IOException
 283   {
 284     StringBuilder sb = new StringBuilder(this.length);
 285     String line = nextLine();
 286     while (line != null && line.startsWith(" "))
 287     {
 288       line = line.trim();
 289       String[] blocks = line.split(WHITESPACE);
 290
 291       /*
 292        * omit the last block (position counter) on each line
 293        */
 294       for (int i = 0; i < blocks.length - 1; i++)
 295       {
 296         sb.append(blocks[i]);
 297       }
 298       line = nextLine();
 299     }
 300     this.sequenceString = sb.toString();
 301
 302     return line;
 303   }
 304
 305   /**
 306    * Processes an FT line. If it declares a feature type of interest (currently,
 307    * only CDS is processed), processes all of the associated lines (feature
 308    * qualifiers), and returns the next line after that, otherwise simply returns
 309    * the next line.
 310    *
 311    * @param line
 312    * @return
 313    * @throws IOException
 314    */
 315   String parseFT(String line) throws IOException
 316   {
 317     String[] tokens = line.split(WHITESPACE);
 318     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 319     {
 320       return nextLine();
 321     }
 322
 323     CdsData data = new CdsData();
 324     data.cdsLocation = tokens[2];
 325
 326     line = nextLine();
 327     while (line != null)
 328     {
 329       if (!line.startsWith("FT    ")) // 4 spaces
 330       {
 331         // e.g. start of next feature "FT source..."
 332         break;
 333       }
 334
 335       /*
 336        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 337        */
 338       int slashPos = line.indexOf('/');
 339       if (slashPos == -1)
 340       {
 341         Cache.log.error("Unexpected EMBL line ignored: " + line);
 342         continue;
 343       }
 344       int eqPos = line.indexOf('=', slashPos + 1);
 345       if (eqPos == -1)
 346       {
 347         // can happen, e.g. /ribosomal_slippage
 348 //        Cache.log.error("Unexpected EMBL line ignored: " + line);
 349         line = nextLine();
 350         continue;
 351       }
 352       String qualifier = line.substring(slashPos + 1, eqPos);
 353       String value = line.substring(eqPos + 1);
 354       if (value.startsWith(QUOTE) && value.endsWith(QUOTE))
 355       {
 356         value = value.substring(1, value.length() - 1);
 357       }
 358
 359       if ("protein_id".equals(qualifier))
 360       {
 361         data.proteinId = value;
 362         line = nextLine();
 363       }
 364       else if ("codon_start".equals(qualifier))
 365       {
 366         try
 367         {
 368           data.codonStart = Integer.parseInt(value.trim());
 369         } catch (NumberFormatException e)
 370         {
 371           Cache.log.error("Invalid codon_start in XML for " + this.accession
 372                   + ": " + e.getMessage());
 373         }
 374         line = nextLine();
 375       }
 376       else if ("db_xref".equals(qualifier))
 377       {
 378         String[] parts = value.split(":");
 379         if (parts.length == 2)
 380         {
 381           String db = parts[0].trim();
 382           db = DBRefUtils.getCanonicalName(db);
 383           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 384           data.xrefs.add(dbref);
 385         }
 386         line = nextLine();
 387       }
 388       else if ("product".equals(qualifier))
 389       {
 390         // sometimes name is returned e.g. for V00488
 391         data.proteinName = value;
 392         line = nextLine();
 393       }
 394       else if ("translation".equals(qualifier))
 395       {
 396         line = parseTranslation(value, data);
 397       }
 398       else if (!"".equals(value))
 399       {
 400         // throw anything else into the additional properties hash
 401         data.cdsProps.put(qualifier, value);
 402         line = nextLine();
 403       }
 404     }
 405
 406     if (data.proteinId != null)
 407     {
 408       this.cds.put(data.proteinId, data);
 409     }
 410     else
 411     {
 412       Cache.log.error("Ignoring CDS feature with no protein_id for "
 413               + sourceDb + ":" + accession);
 414     }
 415
 416     return line;
 417   }
 418
 419   /**
 420    * Reads and returns the CDS translation from one or more lines of the file,
 421    * and returns the next line after that
 422    *
 423    * @param value
 424    *          the first line of the translation (likely quoted)
 425    * @param data
 426    * @return
 427    * @throws IOException
 428    */
 429   String parseTranslation(String value, CdsData data) throws IOException
 430   {
 431     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
 432     sb.append(value.replace(QUOTE, ""));
 433
 434     String line;
 435     while ((line = nextLine()) != null)
 436     {
 437       if (!line.startsWith("FT    "))
 438       {
 439         break; // reached next feature or other input line
 440       }
 441       String[] tokens = line.split(WHITESPACE);
 442       if (tokens.length < 2)
 443       {
 444         Cache.log.error("Ignoring bad EMBL line: " + line);
 445         break;
 446       }
 447       if (tokens[1].startsWith("/"))
 448       {
 449         break; // next feature qualifier
 450       }
 451       sb.append(tokens[1].replace(QUOTE, ""));
 452     }
 453
 454     data.translation = sb.toString();
 455
 456     return line;
 457   }
 458
 459   /**
 460    * Constructs and saves the sequence from parsed components
 461    */
 462   void buildSequence()
 463   {
 464     String name = this.accession;
 465     if (this.sourceDb != null)
 466     {
 467       name = this.sourceDb + "|" + name;
 468     }
 469     SequenceI seq = new Sequence(name, this.sequenceString);
 470     seq.setDescription(this.description);
 471
 472     /*
 473      * add a DBRef to itself
 474      */
 475     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 476     int[] startEnd = new int[] { 1, seq.getLength() };
 477     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 478     seq.addDBRef(selfRef);
 479
 480     for (DBRefEntry dbref : this.dbrefs)
 481     {
 482       seq.addDBRef(dbref);
 483     }
 484
 485     processCDSFeatures(seq);
 486
 487     seq.deriveSequence();
 488
 489     addSequence(seq);
 490   }
 491
 492   /**
 493    * Process the CDS features, including generation of cross-references and
 494    * mappings to the protein products (translation)
 495    *
 496    * @param seq
 497    */
 498   protected void processCDSFeatures(SequenceI seq)
 499   {
 500     /*
 501      * record protein products found to avoid duplication i.e. >1 CDS with
 502      * the same /protein_id [though not sure I can find an example of this]
 503      */
 504     Map<String, SequenceI> proteins = new HashMap<>();
 505     for (CdsData data : cds.values())
 506     {
 507       processCDSFeature(seq, data, proteins);
 508     }
 509   }
 510
 511   /**
 512    * Processes data for one parsed CDS feature to
 513    * <ul>
 514    * <li>create a protein product sequence for the translation</li>
 515    * <li>create a cross-reference to protein with mapping from dna</li>
 516    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 517    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 518    * </ul>
 519    *
 520    * @param SequenceI
 521    *          dna
 522    * @param proteins
 523    *          map of protein products so far derived from CDS data
 524    */
 525   void processCDSFeature(SequenceI dna, CdsData data,
 526           Map<String, SequenceI> proteins)
 527   {
 528     /*
 529      * parse location into a list of [start, end, start, end] positions
 530      */
 531     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 532
 533     MapList maplist = buildMappingToProtein(dna, exons, data);
 534
 535     int exonNumber = 0;
 536
 537     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 538     {
 539       int exonStart = exons[xint];
 540       int exonEnd = exons[xint + 1];
 541       int begin = Math.min(exonStart, exonEnd);
 542       int end = Math.max(exonStart, exonEnd);
 543       exonNumber++;
 544       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 545               exonNumber, data.proteinId);
 546
 547       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 548               this.sourceDb);
 549       for (Entry<String, String> val : data.cdsProps.entrySet())
 550       {
 551         sf.setValue(val.getKey(), val.getValue());
 552       }
 553
 554       sf.setEnaLocation(data.cdsLocation);
 555       boolean forwardStrand = exonStart <= exonEnd;
 556       sf.setStrand(forwardStrand ? "+" : "-");
 557       sf.setPhase(String.valueOf(data.codonStart - 1));
 558       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 559       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 560
 561       dna.addSequenceFeature(sf);
 562     }
 563
 564     boolean hasUniprotDbref = false;
 565     for (DBRefEntry xref : data.xrefs)
 566     {
 567       dna.addDBRef(xref);
 568       if (xref.getSource().equals(DBRefSource.UNIPROT))
 569       {
 570         /*
 571          * construct (or find) the sequence for (data.protein_id, data.translation)
 572          */
 573         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 574         Mapping map = new Mapping(protein, maplist);
 575         map.setMappedFromId(data.proteinId);
 576         xref.setMap(map);
 577
 578         /*
 579          * add DBRefs with mappings from dna to protein and the inverse
 580          */
 581         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 582         db1.setMap(new Mapping(dna, maplist.getInverse()));
 583         protein.addDBRef(db1);
 584
 585         hasUniprotDbref = true;
 586       }
 587     }
 588
 589     /*
 590      * if we have a product (translation) but no explicit Uniprot dbref
 591      * (example: EMBL M19487 protein_id AAB02592.1)
 592      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 593      */
 594     if (!hasUniprotDbref)
 595     {
 596       SequenceI protein = proteins.get(data.proteinId);
 597       if (protein == null)
 598       {
 599         protein = new Sequence(data.proteinId, data.translation);
 600         proteins.put(data.proteinId, protein);
 601       }
 602       // assuming CDSPROTEIN sequence version = dna version (?!)
 603       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 604               this.version, data.proteinId);
 605       protein.addDBRef(db1);
 606
 607       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 608               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 609       Mapping map = new Mapping(protein, maplist);
 610       map.setMappedFromId(data.proteinId);
 611       dnaToEmblProteinRef.setMap(map);
 612       dna.addDBRef(dnaToEmblProteinRef);
 613      }
 614
 615     /*
 616      * comment brought forward from EmblXmlSource, lines 447-451:
 617      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 618      * sequence with the exon  map; if given a dataset reference, search
 619      * dataset for parent EMBL sequence if it exists and set its map;
 620      * make a new feature annotating the coding contig
 621      */
 622   }
 623
 624   /**
 625    * Computes a mapping from CDS positions in DNA sequence to protein product
 626    * positions, with allowance for stop codon or incomplete start codon
 627    *
 628    * @param dna
 629    * @param exons
 630    * @param data
 631    * @return
 632    */
 633   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 634           final CdsData data)
 635   {
 636     MapList dnaToProteinMapping = null;
 637     int peptideLength = data.translation.length();
 638
 639     int[] proteinRange = new int[] { 1, peptideLength };
 640     if (exons != null && exons.length > 0)
 641     {
 642       /*
 643        * We were able to parse 'location'; do a final
 644        * product length truncation check
 645        */
 646       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 647       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 648     }
 649     else
 650     {
 651       /*
 652        * workaround until we handle all 'location' formats fully
 653        * e.g. X53828.1:60..1058 or <123..>289
 654        */
 655       Cache.log.error(String.format(
 656               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 657                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 658               data.cdsLocation, sourceDb, this.accession));
 659
 660       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 661       int mappedDnaEnd = dna.getEnd();
 662       if (peptideLength * 3 == completeCodonsLength)
 663       {
 664         // this might occur for CDS sequences where no features are marked
 665         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 666         mappedDnaEnd = dna.getEnd();
 667       }
 668       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 669       {
 670         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 671         mappedDnaEnd = dna.getEnd() - 3;
 672       }
 673
 674       if (mappedDnaEnd != -1)
 675       {
 676         int[] cdsRanges = new int[] {
 677             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 678         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 679       }
 680     }
 681
 682     return dnaToProteinMapping;
 683   }
 684
 685   /**
 686    * Constructs a sequence for the protein product for the CDS data (if there is
 687    * one), and dbrefs with mappings from CDS to protein and the reverse
 688    *
 689    * @param dna
 690    * @param xref
 691    * @param data
 692    * @param proteins
 693    * @return
 694    */
 695   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 696           CdsData data, Map<String, SequenceI> proteins)
 697   {
 698     /*
 699      * check we have some data to work with
 700      */
 701     if (data.proteinId == null || data.translation == null)
 702     {
 703       return null;
 704     }
 705
 706     /*
 707      * Construct the protein sequence (if not already seen)
 708      */
 709     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 710     SequenceI protein = proteins.get(proteinSeqName);
 711     if (protein == null)
 712     {
 713       protein = new Sequence(proteinSeqName, data.translation, 1,
 714               data.translation.length());
 715       protein.setDescription(data.proteinName != null ? data.proteinName
 716               : "Protein Product from " + sourceDb);
 717       proteins.put(proteinSeqName, protein);
 718     }
 719
 720     return protein;
 721   }
 722
 723   /**
 724    * Returns the CDS location as a single array of [start, end, start, end...]
 725    * positions. If on the reverse strand, these will be in descending order.
 726    *
 727    * @param accession
 728    * @param location
 729    * @return
 730    */
 731   protected int[] getCdsRanges(String accession, String location)
 732   {
 733     if (location == null)
 734     {
 735       return new int[] {};
 736     }
 737
 738     try
 739     {
 740       List<int[]> ranges = DnaUtils.parseLocation(location);
 741       return MappingUtils.listToArray(ranges);
 742     } catch (ParseException e)
 743     {
 744       Cache.log.warn(
 745               String.format("Not parsing inexact CDS location %s in ENA %s",
 746                       location, accession));
 747       return new int[] {};
 748     }
 749   }
 750
 751   /**
 752    * Output (print) is not implemented for EMBL flat file format
 753    */
 754   @Override
 755   public String print(SequenceI[] seqs, boolean jvsuffix)
 756   {
 757     return null;
 758   }
 759
 760   /**
 761    * Truncates (if necessary) the exon intervals to match 3 times the length of
 762    * the protein; also accepts 3 bases longer (for stop codon not included in
 763    * protein)
 764    *
 765    * @param proteinLength
 766    * @param exon
 767    *          an array of [start, end, start, end...] intervals
 768    * @return the same array (if unchanged) or a truncated copy
 769    */
 770   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 771   {
 772     if (proteinLength <= 0 || exon == null)
 773     {
 774       return exon;
 775     }
 776     int expectedCdsLength = proteinLength * 3;
 777     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 778
 779     /*
 780      * if exon length matches protein, or is shorter, or longer by the
 781      * length of a stop codon (3 bases), then leave it unchanged
 782      */
 783     if (expectedCdsLength >= exonLength
 784             || expectedCdsLength == exonLength - 3)
 785     {
 786       return exon;
 787     }
 788
 789     int origxon[];
 790     int sxpos = -1;
 791     int endxon = 0;
 792     origxon = new int[exon.length];
 793     System.arraycopy(exon, 0, origxon, 0, exon.length);
 794     int cdspos = 0;
 795     for (int x = 0; x < exon.length; x += 2)
 796     {
 797       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 798       if (expectedCdsLength <= cdspos)
 799       {
 800         // advanced beyond last codon.
 801         sxpos = x;
 802         if (expectedCdsLength != cdspos)
 803         {
 804           // System.err
 805           // .println("Truncating final exon interval on region by "
 806           // + (cdspos - cdslength));
 807         }
 808
 809         /*
 810          * shrink the final exon - reduce end position if forward
 811          * strand, increase it if reverse
 812          */
 813         if (exon[x + 1] >= exon[x])
 814         {
 815           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 816         }
 817         else
 818         {
 819           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 820         }
 821         break;
 822       }
 823     }
 824
 825     if (sxpos != -1)
 826     {
 827       // and trim the exon interval set if necessary
 828       int[] nxon = new int[sxpos + 2];
 829       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 830       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 831                                 // set
 832       exon = nxon;
 833     }
 834     return exon;
 835   }
 836 }