src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  /** The double quote character, which encloses free text qualifier values */
  private static final String QUOTE = "\"";

  /** Two adjacent double quotes, i.e. one 'escaped' double quote */
  private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  51
  /**
   * A data bean class to hold values parsed from one CDS Feature (FT). All
   * fields are populated by {@code parseFT} and read when the sequence and its
   * protein products are built.
   */
  class CdsData
  {
    /** protein sequence, from the CDS feature /translation qualifier */
    String translation; // from CDS feature /translation

    /** the CDS location descriptor, as read from the file (not parsed) */
    String cdsLocation; // CDS /location raw value

    /** reading frame start from /codon_start; 1 if the qualifier is absent */
    int codonStart = 1; // from CDS /codon_start

    /** product name from /product, used as the protein description */
    String proteinName; // from CDS /product; used for protein description

    /** value of /protein_id; a CDS without one is not saved by parseFT */
    String proteinId; // from CDS /protein_id

    /** one entry per /db_xref qualifier of the form database:accession */
    List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers

    /** any other qualifiers with a non-empty value, keyed by qualifier name */
    Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  }
  71
  /** Regular expression matching a run of one or more whitespace characters */
  private static final String WHITESPACE = "\\s+";

  /**
   * the source database name passed to the constructor; used in the sequence
   * name, as the source of the sequence's own DBRef, and as feature group
   */
  private String sourceDb;

  /*
   * values parsed from the EMBL flatfile record
   */
  private String accession; // from ID (first token)

  private String version; // from ID (second token)

  private String description; // from (first) DE line

  /*
   * used only as the initial capacity of the buffer that collects the
   * sequence in parseSQ, hence a default is acceptable
   */
  private int length = 128; // from ID (7th token), with usable default

  private List<DBRefEntry> dbrefs; // from DR

  private String sequenceString; // from SQ lines

  /*
   * parsed CDS data fields, keyed by protein_id
   */
  private Map<String, CdsData> cds;
  95
  /**
   * Creates a parser for EMBL flatfile data read from the given source. The
   * superclass is asked not to parse immediately; call {@code parse()} to read
   * the data.
   *
   * @param fp
   *          the data source to read from
   * @param sourceId
   *          the name of the source database, recorded on the sequence built
   *          from the data
   * @throws IOException
   */
  public EmblFlatFile(FileParse fp, String sourceId) throws IOException
  {
    super(false, fp); // false = don't parse immediately

    /*
     * a TreeMap keeps CDS data ordered by key (ignoring case),
     * which gives a readable ordering of the CDS sequences
     */
    this.cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    this.dbrefs = new ArrayList<>();
    this.sourceDb = sourceId;
  }
 114
 115   /**
 116    * Parses the flatfile, and if successful, saves as an annotated sequence
 117    * which may be retrieved by calling {@code getSequence()}
 118    *
 119    * @throws IOException
 120    */
 121   public void parse() throws IOException
 122   {
 123     String line = nextLine();
 124     while (line != null)
 125     {
 126       if (line.startsWith("ID"))
 127       {
 128         line = parseID(line);
 129       }
 130       else if (line.startsWith("DE"))
 131       {
 132         line = parseDE(line);
 133       }
 134       else if (line.startsWith("DR"))
 135       {
 136         line = parseDR(line);
 137       }
 138       else if (line.startsWith("SQ"))
 139       {
 140         line = parseSQ();
 141       }
 142       else if (line.startsWith("FT"))
 143       {
 144         line = parseFT(line);
 145       }
 146       else
 147       {
 148         line = nextLine();
 149       }
 150     }
 151     buildSequence();
 152   }
 153
 154   /**
 155    * Extracts and saves the primary accession and version (SV value) from an ID
 156    * line, or null if not found. Returns the next line after the one processed.
 157    *
 158    * @param line
 159    * @throws IOException
 160    */
 161   String parseID(String line) throws IOException
 162   {
 163     String[] tokens = line.substring(2).split(";");
 164
 165     /*
 166      * first is primary accession
 167      */
 168     String token = tokens[0].trim();
 169     if (!token.isEmpty())
 170     {
 171       this.accession = token;
 172     }
 173
 174     /*
 175      * second token is 'SV versionNo'
 176      */
 177     if (tokens.length > 1)
 178     {
 179       token = tokens[1].trim();
 180       if (token.startsWith("SV"))
 181       {
 182         String[] bits = token.trim().split(WHITESPACE);
 183         this.version = bits[bits.length - 1];
 184       }
 185     }
 186
 187     /*
 188      * seventh token is 'length BP'
 189      */
 190     if (tokens.length > 6)
 191     {
 192       token = tokens[6].trim();
 193       String[] bits = token.trim().split(WHITESPACE);
 194       try
 195       {
 196         this.length = Integer.valueOf(bits[0]);
 197       } catch (NumberFormatException e)
 198       {
 199         Cache.log.error("bad length read in flatfile, line: " + line);
 200       }
 201     }
 202
 203     return nextLine();
 204   }
 205
 206   /**
 207    * Reads sequence description from the first DE line found. Any trailing
 208    * period is discarded. If there are multiple DE lines, only the first (short
 209    * description) is read, the rest are ignored.
 210    *
 211    * @param line
 212    * @return
 213    * @throws IOException
 214    */
 215   String parseDE(String line) throws IOException
 216   {
 217     String desc = line.substring(2).trim();
 218     if (desc.endsWith("."))
 219     {
 220       desc = desc.substring(0, desc.length() - 1);
 221     }
 222     this.description = desc;
 223
 224     /*
 225      * pass over any additional DE lines
 226      */
 227     while ((line = nextLine()) != null)
 228     {
 229       if (!line.startsWith("DE"))
 230       {
 231         break;
 232       }
 233     }
 234
 235     return line;
 236   }
 237
 238   /**
 239    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 240    * the line following the line processed.
 241    *
 242    * @param line
 243    * @throws IOException
 244    */
 245   String parseDR(String line) throws IOException
 246   {
 247     String[] tokens = line.substring(2).split(";");
 248     if (tokens.length > 1)
 249     {
 250       /*
 251        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 252        */
 253       String db = tokens[0].trim();
 254       db = DBRefUtils.getCanonicalName(db);
 255       String acc = tokens[1].trim();
 256       if (acc.endsWith("."))
 257       {
 258         acc = acc.substring(0, acc.length() - 1);
 259       }
 260       String version = "0";
 261       if (tokens.length > 2)
 262       {
 263         String secondaryId = tokens[2].trim();
 264         if (!secondaryId.isEmpty())
 265         {
 266           // todo: is this right? secondary id is not a version number
 267           // version = secondaryId;
 268         }
 269       }
 270       this.dbrefs.add(new DBRefEntry(db, version, acc));
 271     }
 272
 273     return nextLine();
 274   }
 275
 276   /**
 277    * Reads and saves the sequence, read from the lines following the SQ line.
 278    * Whitespace and position counters are discarded. Returns the next line
 279    * following the sequence data (the next line that doesn't start with
 280    * whitespace).
 281    *
 282    * @throws IOException
 283    */
 284   String parseSQ() throws IOException
 285   {
 286     StringBuilder sb = new StringBuilder(this.length);
 287     String line = nextLine();
 288     while (line != null && line.startsWith(" "))
 289     {
 290       line = line.trim();
 291       String[] blocks = line.split(WHITESPACE);
 292
 293       /*
 294        * omit the last block (position counter) on each line
 295        */
 296       for (int i = 0; i < blocks.length - 1; i++)
 297       {
 298         sb.append(blocks[i]);
 299       }
 300       line = nextLine();
 301     }
 302     this.sequenceString = sb.toString();
 303
 304     return line;
 305   }
 306
 307   /**
 308    * Processes an FT line. If it declares a feature type of interest (currently,
 309    * only CDS is processed), processes all of the associated lines (feature
 310    * qualifiers), and returns the next line after that, otherwise simply returns
 311    * the next line.
 312    *
 313    * @param line
 314    * @return
 315    * @throws IOException
 316    */
 317   String parseFT(String line) throws IOException
 318   {
 319     String[] tokens = line.split(WHITESPACE);
 320     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 321     {
 322       return nextLine();
 323     }
 324
 325     CdsData data = new CdsData();
 326     data.cdsLocation = tokens[2];
 327     // TODO location can be over >1 line e.g. EAW51554
 328
 329     line = nextLine();
 330     while (line != null)
 331     {
 332       if (!line.startsWith("FT    ")) // 4 spaces
 333       {
 334         // e.g. start of next feature "FT source..."
 335         break;
 336       }
 337
 338       /*
 339        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 340        * - the value may extend over more than one line
 341        * - if the value has enclosing quotes, these are removed
 342        * - escaped double quotes ("") are reduced to a single character
 343        */
 344       int slashPos = line.indexOf('/');
 345       if (slashPos == -1)
 346       {
 347         Cache.log.error("Unexpected EMBL line ignored: " + line);
 348         line = nextLine();
 349         continue;
 350       }
 351       int eqPos = line.indexOf('=', slashPos + 1);
 352       if (eqPos == -1)
 353       {
 354         // can happen, e.g. /ribosomal_slippage
 355         // Cache.log.error("Unexpected EMBL line ignored: " + line);
 356         line = nextLine();
 357         continue;
 358       }
 359       String qualifier = line.substring(slashPos + 1, eqPos);
 360       String value = line.substring(eqPos + 1);
 361       value = removeQuotes(value);
 362       StringBuilder sb = new StringBuilder().append(value);
 363       line = parseFeatureQualifier(sb, qualifier);
 364       String featureValue = sb.toString();
 365
 366       if ("protein_id".equals(qualifier))
 367       {
 368         data.proteinId = featureValue;
 369       }
 370       else if ("codon_start".equals(qualifier))
 371       {
 372         try
 373         {
 374           data.codonStart = Integer.parseInt(featureValue.trim());
 375         } catch (NumberFormatException e)
 376         {
 377           Cache.log.error("Invalid codon_start in XML for " + this.accession
 378                   + ": " + e.getMessage());
 379         }
 380       }
 381       else if ("db_xref".equals(qualifier))
 382       {
 383         String[] parts = featureValue.split(":");
 384         if (parts.length == 2)
 385         {
 386           String db = parts[0].trim();
 387           db = DBRefUtils.getCanonicalName(db);
 388           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 389           data.xrefs.add(dbref);
 390         }
 391       }
 392       else if ("product".equals(qualifier))
 393       {
 394         data.proteinName = featureValue;
 395       }
 396       else if ("translation".equals(qualifier))
 397       {
 398         data.translation = featureValue;
 399       }
 400       else if (!"".equals(featureValue))
 401       {
 402         // throw anything else into the additional properties hash
 403         data.cdsProps.put(qualifier, featureValue);
 404       }
 405     }
 406
 407     if (data.proteinId != null)
 408     {
 409       this.cds.put(data.proteinId, data);
 410     }
 411     else
 412     {
 413       Cache.log.error("Ignoring CDS feature with no protein_id for "
 414               + sourceDb + ":" + accession);
 415     }
 416
 417     return line;
 418   }
 419
 420   /**
 421    * Removes leading or trailing double quotes (") unless doubled, and changes
 422    * any 'escaped' (doubled) double quotes to single characters. As per the
 423    * Feature Table specification for Qualifiers, Free Text.
 424    *
 425    * @param value
 426    * @return
 427    */
 428   static String removeQuotes(String value)
 429   {
 430     if (value == null)
 431     {
 432       return null;
 433     }
 434     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
 435     {
 436       value = value.substring(1);
 437     }
 438     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
 439     {
 440       value = value.substring(0, value.length() - 1);
 441     }
 442     value = value.replace(DOUBLED_QUOTE, QUOTE);
 443     return value;
 444   }
 445
 446   /**
 447    * Reads the value of a feature (FT) qualifier from one or more lines of the
 448    * file, and returns the next line after that. Values are appended to the
 449    * string buffer, which should be already primed with the value read from the
 450    * first line for the qualifier (with any leading double quote removed).
 451    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 452    * reduced to one only. For example for
 453    *
 454    * <pre>
 455    * FT      /note="gene_id=hCG28070.3
 456    * FT      ""foobar"" isoform=CRA_b"
 457    * the returned value is
 458    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 459    * </pre>
 460    *
 461    * Note the side-effect of this method, to advance data reading to the next
 462    * line after the feature qualifier.
 463    *
 464    * @param sb
 465    *          a string buffer primed with the first line of the value
 466    * @param qualifierName
 467    * @return
 468    * @throws IOException
 469    */
 470   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 471           throws IOException
 472   {
 473     String line;
 474     while ((line = nextLine()) != null)
 475     {
 476       if (!line.startsWith("FT    "))
 477       {
 478         break; // reached next feature or other input line
 479       }
 480       String[] tokens = line.split(WHITESPACE);
 481       if (tokens.length < 2)
 482       {
 483         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 484                 + ": " + line);
 485         break;
 486       }
 487       if (tokens[1].startsWith("/"))
 488       {
 489         break; // next feature qualifier
 490       }
 491
 492       /*
 493        * heuristic rule: most multi-line value (e.g. /product) are text,
 494        * so add a space for word boundary at a new line; not for translation
 495        */
 496       if (!"translation".equals(qualifierName))
 497       {
 498         sb.append(" ");
 499       }
 500
 501       /*
 502        * remove trailing " and unescape doubled ""
 503        */
 504       String data = removeQuotes(tokens[1]);
 505       sb.append(data);
 506     }
 507
 508     return line;
 509   }
 510
 511   /**
 512    * Constructs and saves the sequence from parsed components
 513    */
 514   void buildSequence()
 515   {
 516     if (this.accession == null || this.sequenceString == null)
 517     {
 518       Cache.log.error("Failed to parse data from EMBL");
 519       return;
 520     }
 521
 522     String name = this.accession;
 523     if (this.sourceDb != null)
 524     {
 525       name = this.sourceDb + "|" + name;
 526     }
 527     SequenceI seq = new Sequence(name, this.sequenceString);
 528     seq.setDescription(this.description);
 529
 530     /*
 531      * add a DBRef to itself
 532      */
 533     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 534     int[] startEnd = new int[] { 1, seq.getLength() };
 535     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 536     seq.addDBRef(selfRef);
 537
 538     for (DBRefEntry dbref : this.dbrefs)
 539     {
 540       seq.addDBRef(dbref);
 541     }
 542
 543     processCDSFeatures(seq);
 544
 545     seq.deriveSequence();
 546
 547     addSequence(seq);
 548   }
 549
 550   /**
 551    * Process the CDS features, including generation of cross-references and
 552    * mappings to the protein products (translation)
 553    *
 554    * @param seq
 555    */
 556   protected void processCDSFeatures(SequenceI seq)
 557   {
 558     /*
 559      * record protein products found to avoid duplication i.e. >1 CDS with
 560      * the same /protein_id [though not sure I can find an example of this]
 561      */
 562     Map<String, SequenceI> proteins = new HashMap<>();
 563     for (CdsData data : cds.values())
 564     {
 565       processCDSFeature(seq, data, proteins);
 566     }
 567   }
 568
 569   /**
 570    * Processes data for one parsed CDS feature to
 571    * <ul>
 572    * <li>create a protein product sequence for the translation</li>
 573    * <li>create a cross-reference to protein with mapping from dna</li>
 574    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 575    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 576    * </ul>
 577    *
 578    * @param SequenceI
 579    *          dna
 580    * @param proteins
 581    *          map of protein products so far derived from CDS data
 582    */
 583   void processCDSFeature(SequenceI dna, CdsData data,
 584           Map<String, SequenceI> proteins)
 585   {
 586     /*
 587      * parse location into a list of [start, end, start, end] positions
 588      */
 589     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 590
 591     MapList maplist = buildMappingToProtein(dna, exons, data);
 592
 593     int exonNumber = 0;
 594
 595     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 596     {
 597       int exonStart = exons[xint];
 598       int exonEnd = exons[xint + 1];
 599       int begin = Math.min(exonStart, exonEnd);
 600       int end = Math.max(exonStart, exonEnd);
 601       exonNumber++;
 602       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 603               exonNumber, data.proteinId);
 604
 605       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 606               this.sourceDb);
 607       for (Entry<String, String> val : data.cdsProps.entrySet())
 608       {
 609         sf.setValue(val.getKey(), val.getValue());
 610       }
 611
 612       sf.setEnaLocation(data.cdsLocation);
 613       boolean forwardStrand = exonStart <= exonEnd;
 614       sf.setStrand(forwardStrand ? "+" : "-");
 615       sf.setPhase(String.valueOf(data.codonStart - 1));
 616       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 617       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 618
 619       dna.addSequenceFeature(sf);
 620     }
 621
 622     boolean hasUniprotDbref = false;
 623     for (DBRefEntry xref : data.xrefs)
 624     {
 625       dna.addDBRef(xref);
 626       if (xref.getSource().equals(DBRefSource.UNIPROT))
 627       {
 628         /*
 629          * construct (or find) the sequence for (data.protein_id, data.translation)
 630          */
 631         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 632         Mapping map = new Mapping(protein, maplist);
 633         map.setMappedFromId(data.proteinId);
 634         xref.setMap(map);
 635
 636         /*
 637          * add DBRefs with mappings from dna to protein and the inverse
 638          */
 639         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 640         db1.setMap(new Mapping(dna, maplist.getInverse()));
 641         protein.addDBRef(db1);
 642
 643         hasUniprotDbref = true;
 644       }
 645     }
 646
 647     /*
 648      * if we have a product (translation) but no explicit Uniprot dbref
 649      * (example: EMBL M19487 protein_id AAB02592.1)
 650      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 651      */
 652     if (!hasUniprotDbref)
 653     {
 654       SequenceI protein = proteins.get(data.proteinId);
 655       if (protein == null)
 656       {
 657         protein = new Sequence(data.proteinId, data.translation);
 658         protein.setDescription(data.proteinName);
 659         proteins.put(data.proteinId, protein);
 660       }
 661       // assuming CDSPROTEIN sequence version = dna version (?!)
 662       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 663               this.version, data.proteinId);
 664       protein.addDBRef(db1);
 665
 666       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 667               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 668       Mapping map = new Mapping(protein, maplist);
 669       map.setMappedFromId(data.proteinId);
 670       dnaToEmblProteinRef.setMap(map);
 671       dna.addDBRef(dnaToEmblProteinRef);
 672     }
 673
 674     /*
 675      * comment brought forward from EmblXmlSource, lines 447-451:
 676      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 677      * sequence with the exon  map; if given a dataset reference, search
 678      * dataset for parent EMBL sequence if it exists and set its map;
 679      * make a new feature annotating the coding contig
 680      */
 681   }
 682
 683   /**
 684    * Computes a mapping from CDS positions in DNA sequence to protein product
 685    * positions, with allowance for stop codon or incomplete start codon
 686    *
 687    * @param dna
 688    * @param exons
 689    * @param data
 690    * @return
 691    */
 692   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 693           final CdsData data)
 694   {
 695     MapList dnaToProteinMapping = null;
 696     int peptideLength = data.translation.length();
 697
 698     int[] proteinRange = new int[] { 1, peptideLength };
 699     if (exons != null && exons.length > 0)
 700     {
 701       /*
 702        * We were able to parse 'location'; do a final
 703        * product length truncation check
 704        */
 705       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 706       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 707     }
 708     else
 709     {
 710       /*
 711        * workaround until we handle all 'location' formats fully
 712        * e.g. X53828.1:60..1058 or <123..>289
 713        */
 714       Cache.log.error(String.format(
 715               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 716                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 717               data.cdsLocation, sourceDb, this.accession));
 718
 719       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 720       int mappedDnaEnd = dna.getEnd();
 721       if (peptideLength * 3 == completeCodonsLength)
 722       {
 723         // this might occur for CDS sequences where no features are marked
 724         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 725         mappedDnaEnd = dna.getEnd();
 726       }
 727       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 728       {
 729         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 730         mappedDnaEnd = dna.getEnd() - 3;
 731       }
 732
 733       if (mappedDnaEnd != -1)
 734       {
 735         int[] cdsRanges = new int[] {
 736             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 737         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 738       }
 739     }
 740
 741     return dnaToProteinMapping;
 742   }
 743
 744   /**
 745    * Constructs a sequence for the protein product for the CDS data (if there is
 746    * one), and dbrefs with mappings from CDS to protein and the reverse
 747    *
 748    * @param dna
 749    * @param xref
 750    * @param data
 751    * @param proteins
 752    * @return
 753    */
 754   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 755           CdsData data, Map<String, SequenceI> proteins)
 756   {
 757     /*
 758      * check we have some data to work with
 759      */
 760     if (data.proteinId == null || data.translation == null)
 761     {
 762       return null;
 763     }
 764
 765     /*
 766      * Construct the protein sequence (if not already seen)
 767      */
 768     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 769     SequenceI protein = proteins.get(proteinSeqName);
 770     if (protein == null)
 771     {
 772       protein = new Sequence(proteinSeqName, data.translation, 1,
 773               data.translation.length());
 774       protein.setDescription(data.proteinName != null ? data.proteinName
 775               : "Protein Product from " + sourceDb);
 776       proteins.put(proteinSeqName, protein);
 777     }
 778
 779     return protein;
 780   }
 781
 782   /**
 783    * Returns the CDS location as a single array of [start, end, start, end...]
 784    * positions. If on the reverse strand, these will be in descending order.
 785    *
 786    * @param accession
 787    * @param location
 788    * @return
 789    */
 790   protected int[] getCdsRanges(String accession, String location)
 791   {
 792     if (location == null)
 793     {
 794       return new int[] {};
 795     }
 796
 797     try
 798     {
 799       List<int[]> ranges = DnaUtils.parseLocation(location);
 800       return MappingUtils.listToArray(ranges);
 801     } catch (ParseException e)
 802     {
 803       Cache.log.warn(
 804               String.format("Not parsing inexact CDS location %s in ENA %s",
 805                       location, accession));
 806       return new int[] {};
 807     }
 808   }
 809
 810   /**
 811    * Output (print) is not implemented for EMBL flat file format
 812    */
 813   @Override
 814   public String print(SequenceI[] seqs, boolean jvsuffix)
 815   {
 816     return null;
 817   }
 818
 819   /**
 820    * Truncates (if necessary) the exon intervals to match 3 times the length of
 821    * the protein; also accepts 3 bases longer (for stop codon not included in
 822    * protein)
 823    *
 824    * @param proteinLength
 825    * @param exon
 826    *          an array of [start, end, start, end...] intervals
 827    * @return the same array (if unchanged) or a truncated copy
 828    */
 829   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 830   {
 831     if (proteinLength <= 0 || exon == null)
 832     {
 833       return exon;
 834     }
 835     int expectedCdsLength = proteinLength * 3;
 836     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 837
 838     /*
 839      * if exon length matches protein, or is shorter, or longer by the
 840      * length of a stop codon (3 bases), then leave it unchanged
 841      */
 842     if (expectedCdsLength >= exonLength
 843             || expectedCdsLength == exonLength - 3)
 844     {
 845       return exon;
 846     }
 847
 848     int origxon[];
 849     int sxpos = -1;
 850     int endxon = 0;
 851     origxon = new int[exon.length];
 852     System.arraycopy(exon, 0, origxon, 0, exon.length);
 853     int cdspos = 0;
 854     for (int x = 0; x < exon.length; x += 2)
 855     {
 856       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 857       if (expectedCdsLength <= cdspos)
 858       {
 859         // advanced beyond last codon.
 860         sxpos = x;
 861         if (expectedCdsLength != cdspos)
 862         {
 863           // System.err
 864           // .println("Truncating final exon interval on region by "
 865           // + (cdspos - cdslength));
 866         }
 867
 868         /*
 869          * shrink the final exon - reduce end position if forward
 870          * strand, increase it if reverse
 871          */
 872         if (exon[x + 1] >= exon[x])
 873         {
 874           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 875         }
 876         else
 877         {
 878           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 879         }
 880         break;
 881       }
 882     }
 883
 884     if (sxpos != -1)
 885     {
 886       // and trim the exon interval set if necessary
 887       int[] nxon = new int[sxpos + 2];
 888       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 889       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 890                                 // set
 891       exon = nxon;
 892     }
 893     return exon;
 894   }
 895 }