src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  51
  52   /**
  53    * A data bean class to hold values parsed from one CDS Feature (FT)
  54    */
  55   class CdsData
  56   {
  57     String translation; // from CDS feature /translation
  58
  59     String cdsLocation; // CDS /location raw value
  60
  61     int codonStart = 1; // from CDS /codon_start
  62
  63     String proteinName; // from CDS /product; used for protein description
  64
  65     String proteinId; // from CDS /protein_id
  66
  67     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  68
  69     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  70   }
  71
  72   private static final String WHITESPACE = "\\s+";
  73
  74   private String sourceDb;
  75
  76   /*
  77    * values parsed from the EMBL flatfile record
  78    */
  79   private String accession; // from ID (first token)
  80
  81   private String version; // from ID (second token)
  82
  83   private String description; // from (first) DE line
  84
  85   private int length = 128; // from ID (7th token), with usable default
  86
  87   private List<DBRefEntry> dbrefs; // from DR
  88
  89   private String sequenceString; // from SQ lines
  90
  91   /*
  92    * parsed CDS data fields, keyed by protein_id
  93    */
  94   private Map<String, CdsData> cds;
  95
  96   /**
  97    * Constructor
  98    *
  99    * @param fp
 100    * @param sourceId
 101    * @throws IOException
 102    */
 103   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
 104   {
 105     super(false, fp); // don't parse immediately
 106     this.sourceDb = sourceId;
 107     dbrefs = new ArrayList<>();
 108
 109     /*
 110      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 111      */
 112     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 113   }
 114
 115   /**
 116    * Parses the flatfile, and if successful, saves as an annotated sequence
 117    * which may be retrieved by calling {@code getSequence()}
 118    *
 119    * @throws IOException
 120    */
 121   @Override
 122   public void parse() throws IOException
 123   {
 124     String line = nextLine();
 125     while (line != null)
 126     {
 127       if (line.startsWith("ID"))
 128       {
 129         line = parseID(line);
 130       }
 131       else if (line.startsWith("DE"))
 132       {
 133         line = parseDE(line);
 134       }
 135       else if (line.startsWith("DR"))
 136       {
 137         line = parseDR(line);
 138       }
 139       else if (line.startsWith("SQ"))
 140       {
 141         line = parseSQ();
 142       }
 143       else if (line.startsWith("FT"))
 144       {
 145         line = parseFT(line);
 146       }
 147       else
 148       {
 149         line = nextLine();
 150       }
 151     }
 152     buildSequence();
 153   }
 154
 155   /**
 156    * Extracts and saves the primary accession and version (SV value) from an ID
 157    * line, or null if not found. Returns the next line after the one processed.
 158    *
 159    * @param line
 160    * @throws IOException
 161    */
 162   String parseID(String line) throws IOException
 163   {
 164     String[] tokens = line.substring(2).split(";");
 165
 166     /*
 167      * first is primary accession
 168      */
 169     String token = tokens[0].trim();
 170     if (!token.isEmpty())
 171     {
 172       this.accession = token;
 173     }
 174
 175     /*
 176      * second token is 'SV versionNo'
 177      */
 178     if (tokens.length > 1)
 179     {
 180       token = tokens[1].trim();
 181       if (token.startsWith("SV"))
 182       {
 183         String[] bits = token.trim().split(WHITESPACE);
 184         this.version = bits[bits.length - 1];
 185       }
 186     }
 187
 188     /*
 189      * seventh token is 'length BP'
 190      */
 191     if (tokens.length > 6)
 192     {
 193       token = tokens[6].trim();
 194       String[] bits = token.trim().split(WHITESPACE);
 195       try
 196       {
 197         this.length = Integer.valueOf(bits[0]);
 198       } catch (NumberFormatException e)
 199       {
 200         Cache.log.error("bad length read in flatfile, line: " + line);
 201       }
 202     }
 203
 204     return nextLine();
 205   }
 206
 207   /**
 208    * Reads sequence description from the first DE line found. Any trailing
 209    * period is discarded. If there are multiple DE lines, only the first (short
 210    * description) is read, the rest are ignored.
 211    *
 212    * @param line
 213    * @return
 214    * @throws IOException
 215    */
 216   String parseDE(String line) throws IOException
 217   {
 218     String desc = line.substring(2).trim();
 219     if (desc.endsWith("."))
 220     {
 221       desc = desc.substring(0, desc.length() - 1);
 222     }
 223     this.description = desc;
 224
 225     /*
 226      * pass over any additional DE lines
 227      */
 228     while ((line = nextLine()) != null)
 229     {
 230       if (!line.startsWith("DE"))
 231       {
 232         break;
 233       }
 234     }
 235
 236     return line;
 237   }
 238
 239   /**
 240    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 241    * the line following the line processed.
 242    *
 243    * @param line
 244    * @throws IOException
 245    */
 246   String parseDR(String line) throws IOException
 247   {
 248     String[] tokens = line.substring(2).split(";");
 249     if (tokens.length > 1)
 250     {
 251       /*
 252        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 253        */
 254       String db = tokens[0].trim();
 255       db = DBRefUtils.getCanonicalName(db);
 256       String acc = tokens[1].trim();
 257       if (acc.endsWith("."))
 258       {
 259         acc = acc.substring(0, acc.length() - 1);
 260       }
 261       String version = "0";
 262       if (tokens.length > 2)
 263       {
 264         String secondaryId = tokens[2].trim();
 265         if (!secondaryId.isEmpty())
 266         {
 267           // todo: is this right? secondary id is not a version number
 268           // version = secondaryId;
 269         }
 270       }
 271       this.dbrefs.add(new DBRefEntry(db, version, acc));
 272     }
 273
 274     return nextLine();
 275   }
 276
 277   /**
 278    * Reads and saves the sequence, read from the lines following the SQ line.
 279    * Whitespace and position counters are discarded. Returns the next line
 280    * following the sequence data (the next line that doesn't start with
 281    * whitespace).
 282    *
 283    * @throws IOException
 284    */
 285   String parseSQ() throws IOException
 286   {
 287     StringBuilder sb = new StringBuilder(this.length);
 288     String line = nextLine();
 289     while (line != null && line.startsWith(" "))
 290     {
 291       line = line.trim();
 292       String[] blocks = line.split(WHITESPACE);
 293
 294       /*
 295        * omit the last block (position counter) on each line
 296        */
 297       for (int i = 0; i < blocks.length - 1; i++)
 298       {
 299         sb.append(blocks[i]);
 300       }
 301       line = nextLine();
 302     }
 303     this.sequenceString = sb.toString();
 304
 305     return line;
 306   }
 307
 308   /**
 309    * Processes an FT line. If it declares a feature type of interest (currently,
 310    * only CDS is processed), processes all of the associated lines (feature
 311    * qualifiers), and returns the next line after that, otherwise simply returns
 312    * the next line.
 313    *
 314    * @param line
 315    * @return
 316    * @throws IOException
 317    */
 318   String parseFT(String line) throws IOException
 319   {
 320     String[] tokens = line.split(WHITESPACE);
 321     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 322     {
 323       return nextLine();
 324     }
 325
 326     /*
 327      * parse location - which may be over more than one line e.g. EAW51554
 328      */
 329     CdsData data = new CdsData();
 330     StringBuilder sb = new StringBuilder().append(tokens[2]);
 331     line = parseFeatureQualifier(sb, "CDS");
 332     data.cdsLocation = sb.toString();
 333
 334     while (line != null)
 335     {
 336       if (!line.startsWith("FT    ")) // 4 spaces
 337       {
 338         // e.g. start of next feature "FT source..."
 339         break;
 340       }
 341
 342       /*
 343        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 344        * - the value may extend over more than one line
 345        * - if the value has enclosing quotes, these are removed
 346        * - escaped double quotes ("") are reduced to a single character
 347        */
 348       int slashPos = line.indexOf('/');
 349       if (slashPos == -1)
 350       {
 351         Cache.log.error("Unexpected EMBL line ignored: " + line);
 352         line = nextLine();
 353         continue;
 354       }
 355       int eqPos = line.indexOf('=', slashPos + 1);
 356       if (eqPos == -1)
 357       {
 358         // can happen, e.g. /ribosomal_slippage
 359         // Cache.log.error("Unexpected EMBL line ignored: " + line);
 360         line = nextLine();
 361         continue;
 362       }
 363       String qualifier = line.substring(slashPos + 1, eqPos);
 364       String value = line.substring(eqPos + 1);
 365       value = removeQuotes(value);
 366       sb = new StringBuilder().append(value);
 367       line = parseFeatureQualifier(sb, qualifier);
 368       String featureValue = sb.toString();
 369
 370       if ("protein_id".equals(qualifier))
 371       {
 372         data.proteinId = featureValue;
 373       }
 374       else if ("codon_start".equals(qualifier))
 375       {
 376         try
 377         {
 378           data.codonStart = Integer.parseInt(featureValue.trim());
 379         } catch (NumberFormatException e)
 380         {
 381           Cache.log.error("Invalid codon_start in XML for " + this.accession
 382                   + ": " + e.getMessage());
 383         }
 384       }
 385       else if ("db_xref".equals(qualifier))
 386       {
 387         String[] parts = featureValue.split(":");
 388         if (parts.length == 2)
 389         {
 390           String db = parts[0].trim();
 391           db = DBRefUtils.getCanonicalName(db);
 392           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 393           data.xrefs.add(dbref);
 394         }
 395       }
 396       else if ("product".equals(qualifier))
 397       {
 398         data.proteinName = featureValue;
 399       }
 400       else if ("translation".equals(qualifier))
 401       {
 402         data.translation = featureValue;
 403       }
 404       else if (!"".equals(featureValue))
 405       {
 406         // throw anything else into the additional properties hash
 407         data.cdsProps.put(qualifier, featureValue);
 408       }
 409     }
 410
 411     if (data.proteinId != null)
 412     {
 413       this.cds.put(data.proteinId, data);
 414     }
 415     else
 416     {
 417       Cache.log.error("Ignoring CDS feature with no protein_id for "
 418               + sourceDb + ":" + accession);
 419     }
 420
 421     return line;
 422   }
 423
 424   /**
 425    * Removes leading or trailing double quotes (") unless doubled, and changes
 426    * any 'escaped' (doubled) double quotes to single characters. As per the
 427    * Feature Table specification for Qualifiers, Free Text.
 428    *
 429    * @param value
 430    * @return
 431    */
 432   static String removeQuotes(String value)
 433   {
 434     if (value == null)
 435     {
 436       return null;
 437     }
 438     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
 439     {
 440       value = value.substring(1);
 441     }
 442     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
 443     {
 444       value = value.substring(0, value.length() - 1);
 445     }
 446     value = value.replace(DOUBLED_QUOTE, QUOTE);
 447     return value;
 448   }
 449
 450   /**
 451    * Reads the value of a feature (FT) qualifier from one or more lines of the
 452    * file, and returns the next line after that. Values are appended to the
 453    * string buffer, which should be already primed with the value read from the
 454    * first line for the qualifier (with any leading double quote removed).
 455    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 456    * reduced to one only. For example for
 457    *
 458    * <pre>
 459    * FT      /note="gene_id=hCG28070.3
 460    * FT      ""foobar"" isoform=CRA_b"
 461    * the returned value is
 462    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 463    * </pre>
 464    *
 465    * Note the side-effect of this method, to advance data reading to the next
 466    * line after the feature qualifier.
 467    *
 468    * @param sb
 469    *          a string buffer primed with the first line of the value
 470    * @param qualifierName
 471    * @return
 472    * @throws IOException
 473    */
 474   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 475           throws IOException
 476   {
 477     String line;
 478     while ((line = nextLine()) != null)
 479     {
 480       if (!line.startsWith("FT    "))
 481       {
 482         break; // reached next feature or other input line
 483       }
 484       String[] tokens = line.split(WHITESPACE);
 485       if (tokens.length < 2)
 486       {
 487         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 488                 + ": " + line);
 489         break;
 490       }
 491       if (tokens[1].startsWith("/"))
 492       {
 493         break; // next feature qualifier
 494       }
 495
 496       /*
 497        * heuristic rule: most multi-line value (e.g. /product) are text,
 498        * so add a space for word boundary at a new line; not for translation
 499        */
 500       if (!"translation".equals(qualifierName)
 501               && !"CDS".equals(qualifierName))
 502       {
 503         sb.append(" ");
 504       }
 505
 506       /*
 507        * remove trailing " and unescape doubled ""
 508        */
 509       String data = removeQuotes(tokens[1]);
 510       sb.append(data);
 511     }
 512
 513     return line;
 514   }
 515
 516   /**
 517    * Constructs and saves the sequence from parsed components
 518    */
 519   void buildSequence()
 520   {
 521     if (this.accession == null || this.sequenceString == null)
 522     {
 523       Cache.log.error("Failed to parse data from EMBL");
 524       return;
 525     }
 526
 527     String name = this.accession;
 528     if (this.sourceDb != null)
 529     {
 530       name = this.sourceDb + "|" + name;
 531     }
 532     SequenceI seq = new Sequence(name, this.sequenceString);
 533     seq.setDescription(this.description);
 534
 535     /*
 536      * add a DBRef to itself
 537      */
 538     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 539     int[] startEnd = new int[] { 1, seq.getLength() };
 540     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 541     seq.addDBRef(selfRef);
 542
 543     for (DBRefEntry dbref : this.dbrefs)
 544     {
 545       seq.addDBRef(dbref);
 546     }
 547
 548     processCDSFeatures(seq);
 549
 550     seq.deriveSequence();
 551
 552     addSequence(seq);
 553   }
 554
 555   /**
 556    * Process the CDS features, including generation of cross-references and
 557    * mappings to the protein products (translation)
 558    *
 559    * @param seq
 560    */
 561   protected void processCDSFeatures(SequenceI seq)
 562   {
 563     /*
 564      * record protein products found to avoid duplication i.e. >1 CDS with
 565      * the same /protein_id [though not sure I can find an example of this]
 566      */
 567     Map<String, SequenceI> proteins = new HashMap<>();
 568     for (CdsData data : cds.values())
 569     {
 570       processCDSFeature(seq, data, proteins);
 571     }
 572   }
 573
 574   /**
 575    * Processes data for one parsed CDS feature to
 576    * <ul>
 577    * <li>create a protein product sequence for the translation</li>
 578    * <li>create a cross-reference to protein with mapping from dna</li>
 579    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 580    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 581    * </ul>
 582    *
 583    * @param SequenceI
 584    *          dna
 585    * @param proteins
 586    *          map of protein products so far derived from CDS data
 587    */
 588   void processCDSFeature(SequenceI dna, CdsData data,
 589           Map<String, SequenceI> proteins)
 590   {
 591     /*
 592      * parse location into a list of [start, end, start, end] positions
 593      */
 594     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 595
 596     MapList maplist = buildMappingToProtein(dna, exons, data);
 597
 598     int exonNumber = 0;
 599
 600     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 601     {
 602       int exonStart = exons[xint];
 603       int exonEnd = exons[xint + 1];
 604       int begin = Math.min(exonStart, exonEnd);
 605       int end = Math.max(exonStart, exonEnd);
 606       exonNumber++;
 607       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 608               exonNumber, data.proteinId);
 609
 610       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 611               this.sourceDb);
 612       for (Entry<String, String> val : data.cdsProps.entrySet())
 613       {
 614         sf.setValue(val.getKey(), val.getValue());
 615       }
 616
 617       sf.setEnaLocation(data.cdsLocation);
 618       boolean forwardStrand = exonStart <= exonEnd;
 619       sf.setStrand(forwardStrand ? "+" : "-");
 620       sf.setPhase(String.valueOf(data.codonStart - 1));
 621       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 622       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 623
 624       dna.addSequenceFeature(sf);
 625     }
 626
 627     boolean hasUniprotDbref = false;
 628     for (DBRefEntry xref : data.xrefs)
 629     {
 630       dna.addDBRef(xref);
 631       if (xref.getSource().equals(DBRefSource.UNIPROT))
 632       {
 633         /*
 634          * construct (or find) the sequence for (data.protein_id, data.translation)
 635          */
 636         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 637         Mapping map = new Mapping(protein, maplist);
 638         map.setMappedFromId(data.proteinId);
 639         xref.setMap(map);
 640
 641         /*
 642          * add DBRefs with mappings from dna to protein and the inverse
 643          */
 644         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 645         db1.setMap(new Mapping(dna, maplist.getInverse()));
 646         protein.addDBRef(db1);
 647
 648         hasUniprotDbref = true;
 649       }
 650     }
 651
 652     /*
 653      * if we have a product (translation) but no explicit Uniprot dbref
 654      * (example: EMBL M19487 protein_id AAB02592.1)
 655      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 656      */
 657     if (!hasUniprotDbref)
 658     {
 659       SequenceI protein = proteins.get(data.proteinId);
 660       if (protein == null)
 661       {
 662         protein = new Sequence(data.proteinId, data.translation);
 663         protein.setDescription(data.proteinName);
 664         proteins.put(data.proteinId, protein);
 665       }
 666       // assuming CDSPROTEIN sequence version = dna version (?!)
 667       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 668               this.version, data.proteinId);
 669       protein.addDBRef(db1);
 670
 671       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 672               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 673       Mapping map = new Mapping(protein, maplist);
 674       map.setMappedFromId(data.proteinId);
 675       dnaToEmblProteinRef.setMap(map);
 676       dna.addDBRef(dnaToEmblProteinRef);
 677     }
 678
 679     /*
 680      * comment brought forward from EmblXmlSource, lines 447-451:
 681      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 682      * sequence with the exon  map; if given a dataset reference, search
 683      * dataset for parent EMBL sequence if it exists and set its map;
 684      * make a new feature annotating the coding contig
 685      */
 686   }
 687
 688   /**
 689    * Computes a mapping from CDS positions in DNA sequence to protein product
 690    * positions, with allowance for stop codon or incomplete start codon
 691    *
 692    * @param dna
 693    * @param exons
 694    * @param data
 695    * @return
 696    */
 697   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 698           final CdsData data)
 699   {
 700     MapList dnaToProteinMapping = null;
 701     int peptideLength = data.translation.length();
 702
 703     int[] proteinRange = new int[] { 1, peptideLength };
 704     if (exons != null && exons.length > 0)
 705     {
 706       /*
 707        * We were able to parse 'location'; do a final
 708        * product length truncation check
 709        */
 710       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 711       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 712     }
 713     else
 714     {
 715       /*
 716        * workaround until we handle all 'location' formats fully
 717        * e.g. X53828.1:60..1058 or <123..>289
 718        */
 719       Cache.log.error(String.format(
 720               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 721                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 722               data.cdsLocation, sourceDb, this.accession));
 723
 724       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 725       int mappedDnaEnd = dna.getEnd();
 726       if (peptideLength * 3 == completeCodonsLength)
 727       {
 728         // this might occur for CDS sequences where no features are marked
 729         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 730         mappedDnaEnd = dna.getEnd();
 731       }
 732       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 733       {
 734         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 735         mappedDnaEnd = dna.getEnd() - 3;
 736       }
 737
 738       if (mappedDnaEnd != -1)
 739       {
 740         int[] cdsRanges = new int[] {
 741             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 742         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 743       }
 744     }
 745
 746     return dnaToProteinMapping;
 747   }
 748
 749   /**
 750    * Constructs a sequence for the protein product for the CDS data (if there is
 751    * one), and dbrefs with mappings from CDS to protein and the reverse
 752    *
 753    * @param dna
 754    * @param xref
 755    * @param data
 756    * @param proteins
 757    * @return
 758    */
 759   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 760           CdsData data, Map<String, SequenceI> proteins)
 761   {
 762     /*
 763      * check we have some data to work with
 764      */
 765     if (data.proteinId == null || data.translation == null)
 766     {
 767       return null;
 768     }
 769
 770     /*
 771      * Construct the protein sequence (if not already seen)
 772      */
 773     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 774     SequenceI protein = proteins.get(proteinSeqName);
 775     if (protein == null)
 776     {
 777       protein = new Sequence(proteinSeqName, data.translation, 1,
 778               data.translation.length());
 779       protein.setDescription(data.proteinName != null ? data.proteinName
 780               : "Protein Product from " + sourceDb);
 781       proteins.put(proteinSeqName, protein);
 782     }
 783
 784     return protein;
 785   }
 786
 787   /**
 788    * Returns the CDS location as a single array of [start, end, start, end...]
 789    * positions. If on the reverse strand, these will be in descending order.
 790    *
 791    * @param accession
 792    * @param location
 793    * @return
 794    */
 795   protected int[] getCdsRanges(String accession, String location)
 796   {
 797     if (location == null)
 798     {
 799       return new int[] {};
 800     }
 801
 802     try
 803     {
 804       List<int[]> ranges = DnaUtils.parseLocation(location);
 805       return MappingUtils.listToArray(ranges);
 806     } catch (ParseException e)
 807     {
 808       Cache.log.warn(
 809               String.format("Not parsing inexact CDS location %s in ENA %s",
 810                       location, accession));
 811       return new int[] {};
 812     }
 813   }
 814
 815   /**
 816    * Output (print) is not implemented for EMBL flat file format
 817    */
 818   @Override
 819   public String print(SequenceI[] seqs, boolean jvsuffix)
 820   {
 821     return null;
 822   }
 823
 824   /**
 825    * Truncates (if necessary) the exon intervals to match 3 times the length of
 826    * the protein; also accepts 3 bases longer (for stop codon not included in
 827    * protein)
 828    *
 829    * @param proteinLength
 830    * @param exon
 831    *          an array of [start, end, start, end...] intervals
 832    * @return the same array (if unchanged) or a truncated copy
 833    */
 834   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 835   {
 836     if (proteinLength <= 0 || exon == null)
 837     {
 838       return exon;
 839     }
 840     int expectedCdsLength = proteinLength * 3;
 841     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 842
 843     /*
 844      * if exon length matches protein, or is shorter, or longer by the
 845      * length of a stop codon (3 bases), then leave it unchanged
 846      */
 847     if (expectedCdsLength >= exonLength
 848             || expectedCdsLength == exonLength - 3)
 849     {
 850       return exon;
 851     }
 852
 853     int origxon[];
 854     int sxpos = -1;
 855     int endxon = 0;
 856     origxon = new int[exon.length];
 857     System.arraycopy(exon, 0, origxon, 0, exon.length);
 858     int cdspos = 0;
 859     for (int x = 0; x < exon.length; x += 2)
 860     {
 861       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 862       if (expectedCdsLength <= cdspos)
 863       {
 864         // advanced beyond last codon.
 865         sxpos = x;
 866         if (expectedCdsLength != cdspos)
 867         {
 868           // System.err
 869           // .println("Truncating final exon interval on region by "
 870           // + (cdspos - cdslength));
 871         }
 872
 873         /*
 874          * shrink the final exon - reduce end position if forward
 875          * strand, increase it if reverse
 876          */
 877         if (exon[x + 1] >= exon[x])
 878         {
 879           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 880         }
 881         else
 882         {
 883           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 884         }
 885         break;
 886       }
 887     }
 888
 889     if (sxpos != -1)
 890     {
 891       // and trim the exon interval set if necessary
 892       int[] nxon = new int[sxpos + 2];
 893       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 894       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 895                                 // set
 896       exon = nxon;
 897     }
 898     return exon;
 899   }
 900 }