src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  51
  52   /**
  53    * when true, interpret the mol_type 'source' feature attribute and generate
  54    * an RNA sequence from the DNA record
  55    */
  56   private boolean produceRna = true;
  57
  58   /**
  59    * A data bean class to hold values parsed from one CDS Feature (FT)
  60    */
  61   class CdsData
  62   {
  63     String translation; // from CDS feature /translation
  64
  65     String cdsLocation; // CDS /location raw value
  66
  67     int codonStart = 1; // from CDS /codon_start
  68
  69     String proteinName; // from CDS /product; used for protein description
  70
  71     String proteinId; // from CDS /protein_id
  72
  73     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  74
  75     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  76   }
  77
  78   private static final String WHITESPACE = "\\s+";
  79
  80   private String sourceDb;
  81
  82   /*
  83    * values parsed from the EMBL flatfile record
  84    */
  85   private String accession; // from ID (first token)
  86
  87   private String version; // from ID (second token)
  88
  89   private String description; // from (first) DE line
  90
  91   private int length = 128; // from ID (7th token), with usable default
  92
  93   private List<DBRefEntry> dbrefs; // from DR
  94
  95   private boolean sequenceStringIsRNA = false;
  96
  97   private String sequenceString; // from SQ lines
  98
  99   /*
 100    * parsed CDS data fields, keyed by protein_id
 101    */
 102   private Map<String, CdsData> cds;
 103
 104   /**
 105    * Constructor
 106    *
 107    * @param fp
 108    * @param sourceId
 109    * @throws IOException
 110    */
 111   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
 112   {
 113     super(false, fp); // don't parse immediately
 114     this.sourceDb = sourceId;
 115     dbrefs = new ArrayList<>();
 116
 117     /*
 118      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 119      */
 120     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 121   }
 122
 123   /**
 124    * Parses the flatfile, and if successful, saves as an annotated sequence
 125    * which may be retrieved by calling {@code getSequence()}
 126    *
 127    * @throws IOException
 128    */
 129   public void parse() throws IOException
 130   {
 131     String line = nextLine();
 132     while (line != null)
 133     {
 134       if (line.startsWith("ID"))
 135       {
 136         line = parseID(line);
 137       }
 138       else if (line.startsWith("DE"))
 139       {
 140         line = parseDE(line);
 141       }
 142       else if (line.startsWith("DR"))
 143       {
 144         line = parseDR(line);
 145       }
 146       else if (line.startsWith("SQ"))
 147       {
 148         line = parseSQ();
 149       }
 150       else if (line.startsWith("FT"))
 151       {
 152         line = parseFT(line);
 153       }
 154       else
 155       {
 156         line = nextLine();
 157       }
 158     }
 159     buildSequence();
 160   }
 161
 162   /**
 163    * Extracts and saves the primary accession and version (SV value) from an ID
 164    * line, or null if not found. Returns the next line after the one processed.
 165    *
 166    * @param line
 167    * @throws IOException
 168    */
 169   String parseID(String line) throws IOException
 170   {
 171     String[] tokens = line.substring(2).split(";");
 172
 173     /*
 174      * first is primary accession
 175      */
 176     String token = tokens[0].trim();
 177     if (!token.isEmpty())
 178     {
 179       this.accession = token;
 180     }
 181
 182     /*
 183      * second token is 'SV versionNo'
 184      */
 185     if (tokens.length > 1)
 186     {
 187       token = tokens[1].trim();
 188       if (token.startsWith("SV"))
 189       {
 190         String[] bits = token.trim().split(WHITESPACE);
 191         this.version = bits[bits.length - 1];
 192       }
 193     }
 194
 195     /*
 196      * seventh token is 'length BP'
 197      */
 198     if (tokens.length > 6)
 199     {
 200       token = tokens[6].trim();
 201       String[] bits = token.trim().split(WHITESPACE);
 202       try
 203       {
 204         this.length = Integer.valueOf(bits[0]);
 205       } catch (NumberFormatException e)
 206       {
 207         Cache.log.error("bad length read in flatfile, line: " + line);
 208       }
 209     }
 210
 211     return nextLine();
 212   }
 213
 214   /**
 215    * Reads sequence description from the first DE line found. Any trailing
 216    * period is discarded. If there are multiple DE lines, only the first (short
 217    * description) is read, the rest are ignored.
 218    *
 219    * @param line
 220    * @return
 221    * @throws IOException
 222    */
 223   String parseDE(String line) throws IOException
 224   {
 225     String desc = line.substring(2).trim();
 226     if (desc.endsWith("."))
 227     {
 228       desc = desc.substring(0, desc.length() - 1);
 229     }
 230     this.description = desc;
 231
 232     /*
 233      * pass over any additional DE lines
 234      */
 235     while ((line = nextLine()) != null)
 236     {
 237       if (!line.startsWith("DE"))
 238       {
 239         break;
 240       }
 241     }
 242
 243     return line;
 244   }
 245
 246   /**
 247    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 248    * the line following the line processed.
 249    *
 250    * @param line
 251    * @throws IOException
 252    */
 253   String parseDR(String line) throws IOException
 254   {
 255     String[] tokens = line.substring(2).split(";");
 256     if (tokens.length > 1)
 257     {
 258       /*
 259        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 260        */
 261       String db = tokens[0].trim();
 262       db = DBRefUtils.getCanonicalName(db);
 263       String acc = tokens[1].trim();
 264       if (acc.endsWith("."))
 265       {
 266         acc = acc.substring(0, acc.length() - 1);
 267       }
 268       String version = "0";
 269       if (tokens.length > 2)
 270       {
 271         String secondaryId = tokens[2].trim();
 272         if (!secondaryId.isEmpty())
 273         {
 274           // todo: is this right? secondary id is not a version number
 275           // version = secondaryId;
 276         }
 277       }
 278       this.dbrefs.add(new DBRefEntry(db, version, acc));
 279     }
 280
 281     return nextLine();
 282   }
 283
 284   /**
 285    * Reads and saves the sequence, read from the lines following the SQ line.
 286    * Whitespace and position counters are discarded. Returns the next line
 287    * following the sequence data (the next line that doesn't start with
 288    * whitespace).
 289    *
 290    * @throws IOException
 291    */
 292   String parseSQ() throws IOException
 293   {
 294     StringBuilder sb = new StringBuilder(this.length);
 295     String line = nextLine();
 296     while (line != null && line.startsWith(" "))
 297     {
 298       line = line.trim();
 299       String[] blocks = line.split(WHITESPACE);
 300
 301       /*
 302        * omit the last block (position counter) on each line
 303        */
 304       for (int i = 0; i < blocks.length - 1; i++)
 305       {
 306         sb.append(blocks[i]);
 307       }
 308       line = nextLine();
 309     }
 310     this.sequenceString = sb.toString();
 311
 312     return line;
 313   }
 314
 315   /**
 316    * Processes an FT line. If it declares a feature type of interest (currently,
 317    * only CDS is processed), processes all of the associated lines (feature
 318    * qualifiers), and returns the next line after that, otherwise simply returns
 319    * the next line.
 320    *
 321    * @param line
 322    * @return
 323    * @throws IOException
 324    */
 325   String parseFT(String line) throws IOException
 326   {
 327     String[] tokens = line.split(WHITESPACE);
 328     if (tokens.length < 3
 329             || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
 330     {
 331       return nextLine();
 332     }
 333
 334     if (tokens[1].equals("source"))
 335     {
 336       return parseSourceQualifiers(tokens);
 337     }
 338
 339     /*
 340      * parse location - which may be over more than one line e.g. EAW51554
 341      */
 342     CdsData data = new CdsData();
 343     data.cdsLocation = tokens[2];
 344     // TODO location can be over >1 line e.g. EAW51554
 345
 346     line = nextLine();
 347     while (line != null)
 348     {
 349       if (!line.startsWith("FT    ")) // 4 spaces
 350       {
 351         // e.g. start of next feature "FT source..."
 352         break;
 353       }
 354
 355       /*
 356        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 357        * - the value may extend over more than one line
 358        * - if the value has enclosing quotes, these are removed
 359        * - escaped double quotes ("") are reduced to a single character
 360        */
 361       int slashPos = line.indexOf('/');
 362       if (slashPos == -1)
 363       {
 364         Cache.log.error("Unexpected EMBL line ignored: " + line);
 365         line = nextLine();
 366         continue;
 367       }
 368       int eqPos = line.indexOf('=', slashPos + 1);
 369       if (eqPos == -1)
 370       {
 371         // can happen, e.g. /ribosomal_slippage
 372         // Cache.log.error("Unexpected EMBL line ignored: " + line);
 373         line = nextLine();
 374         continue;
 375       }
 376       String qualifier = line.substring(slashPos + 1, eqPos);
 377       String value = line.substring(eqPos + 1);
 378       value = removeQuotes(value);
 379       StringBuilder sb = new StringBuilder().append(value);
 380       line = parseFeatureQualifier(sb, qualifier);
 381       String featureValue = sb.toString();
 382
 383       if ("protein_id".equals(qualifier))
 384       {
 385         data.proteinId = featureValue;
 386       }
 387       else if ("codon_start".equals(qualifier))
 388       {
 389         try
 390         {
 391           data.codonStart = Integer.parseInt(featureValue.trim());
 392         } catch (NumberFormatException e)
 393         {
 394           Cache.log.error("Invalid codon_start in XML for " + this.accession
 395                   + ": " + e.getMessage());
 396         }
 397       }
 398       else if ("db_xref".equals(qualifier))
 399       {
 400         String[] parts = featureValue.split(":");
 401         if (parts.length == 2)
 402         {
 403           String db = parts[0].trim();
 404           db = DBRefUtils.getCanonicalName(db);
 405           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 406           data.xrefs.add(dbref);
 407         }
 408       }
 409       else if ("product".equals(qualifier))
 410       {
 411         data.proteinName = featureValue;
 412       }
 413       else if ("translation".equals(qualifier))
 414       {
 415         data.translation = featureValue;
 416       }
 417       else if (!"".equals(featureValue))
 418       {
 419         // throw anything else into the additional properties hash
 420         data.cdsProps.put(qualifier, featureValue);
 421       }
 422     }
 423
 424     if (data.proteinId != null)
 425     {
 426       this.cds.put(data.proteinId, data);
 427     }
 428     else
 429     {
 430       Cache.log.error("Ignoring CDS feature with no protein_id for "
 431               + sourceDb + ":" + accession);
 432     }
 433
 434     return line;
 435   }
 436
 437   /**
 438    * process attributes for 'source' until the next FT feature entry only
 439    * interested in 'mol_type'
 440    *
 441    * @param tokens
 442    * @return
 443    * @throws IOException
 444    */
 445   private String parseSourceQualifiers(String[] tokens) throws IOException
 446   {
 447     if (!"source".equals(tokens[1]))
 448     {
 449       throw (new RuntimeException("Not given a source qualifier"));
 450     }
 451     // search for mol_type attribute
 452
 453     StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
 454                                                               // sequence
 455
 456     String line = parseFeatureQualifier(sb, "source");
 457     while (line != null)
 458     {
 459       if (!line.startsWith("FT    ")) // four spaces, end of this feature table
 460                                       // entry
 461       {
 462         return line;
 463       }
 464
 465       int p = line.indexOf("\\mol_type");
 466       int qs = line.indexOf("\"", p);
 467       int qe = line.indexOf("\"", qs + 1);
 468       String qualifier = line.substring(qs, qe).toLowerCase();
 469       if (qualifier.indexOf("rna") > -1)
 470       {
 471         sequenceStringIsRNA = true;
 472       }
 473       if (qualifier.indexOf("dna") > -1)
 474       {
 475         sequenceStringIsRNA = false;
 476       }
 477       line = parseFeatureQualifier(sb, "source");
 478     }
 479     return line;
 480   }
 481
 482   /**
 483    * Removes leading or trailing double quotes (") unless doubled, and changes
 484    * any 'escaped' (doubled) double quotes to single characters. As per the
 485    * Feature Table specification for Qualifiers, Free Text.
 486    *
 487    * @param value
 488    * @return
 489    */
 490   static String removeQuotes(String value)
 491   {
 492     if (value == null)
 493     {
 494       return null;
 495     }
 496     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
 497     {
 498       value = value.substring(1);
 499     }
 500     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
 501     {
 502       value = value.substring(0, value.length() - 1);
 503     }
 504     value = value.replace(DOUBLED_QUOTE, QUOTE);
 505     return value;
 506   }
 507
 508   /**
 509    * Reads the value of a feature (FT) qualifier from one or more lines of the
 510    * file, and returns the next line after that. Values are appended to the
 511    * string buffer, which should be already primed with the value read from the
 512    * first line for the qualifier (with any leading double quote removed).
 513    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 514    * reduced to one only. For example for
 515    *
 516    * <pre>
 517    * FT      /note="gene_id=hCG28070.3
 518    * FT      ""foobar"" isoform=CRA_b"
 519    * the returned value is
 520    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 521    * </pre>
 522    *
 523    * Note the side-effect of this method, to advance data reading to the next
 524    * line after the feature qualifier.
 525    *
 526    * @param sb
 527    *          a string buffer primed with the first line of the value
 528    * @param qualifierName
 529    * @return
 530    * @throws IOException
 531    */
 532   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 533           throws IOException
 534   {
 535     String line;
 536     while ((line = nextLine()) != null)
 537     {
 538       if (!line.startsWith("FT    "))
 539       {
 540         break; // reached next feature or other input line
 541       }
 542       String[] tokens = line.split(WHITESPACE);
 543       if (tokens.length < 2)
 544       {
 545         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 546                 + ": " + line);
 547         break;
 548       }
 549       if (tokens[1].startsWith("/"))
 550       {
 551         break; // next feature qualifier
 552       }
 553
 554       /*
 555        * heuristic rule: most multi-line value (e.g. /product) are text,
 556        * so add a space for word boundary at a new line; not for translation
 557        */
 558       if (!"translation".equals(qualifierName))
 559       {
 560         sb.append(" ");
 561       }
 562
 563       /*
 564        * remove trailing " and unescape doubled ""
 565        */
 566       String data = removeQuotes(tokens[1]);
 567       sb.append(data);
 568     }
 569
 570     return line;
 571   }
 572
 573   /**
 574    * Constructs and saves the sequence from parsed components
 575    */
 576   void buildSequence()
 577   {
 578     if (this.accession == null || this.sequenceString == null)
 579     {
 580       Cache.log.error("Failed to parse data from EMBL");
 581       return;
 582     }
 583
 584     String name = this.accession;
 585     if (this.sourceDb != null)
 586     {
 587       name = this.sourceDb + "|" + name;
 588     }
 589
 590     if (produceRna && sequenceStringIsRNA)
 591     {
 592       sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
 593     }
 594
 595     SequenceI seq = new Sequence(name, this.sequenceString);
 596     seq.setDescription(this.description);
 597
 598     /*
 599      * add a DBRef to itself
 600      */
 601     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 602     int[] startEnd = new int[] { 1, seq.getLength() };
 603     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 604     seq.addDBRef(selfRef);
 605
 606     for (DBRefEntry dbref : this.dbrefs)
 607     {
 608       seq.addDBRef(dbref);
 609     }
 610
 611     processCDSFeatures(seq);
 612
 613     seq.deriveSequence();
 614
 615     addSequence(seq);
 616   }
 617
 618   /**
 619    * Process the CDS features, including generation of cross-references and
 620    * mappings to the protein products (translation)
 621    *
 622    * @param seq
 623    */
 624   protected void processCDSFeatures(SequenceI seq)
 625   {
 626     /*
 627      * record protein products found to avoid duplication i.e. >1 CDS with
 628      * the same /protein_id [though not sure I can find an example of this]
 629      */
 630     Map<String, SequenceI> proteins = new HashMap<>();
 631     for (CdsData data : cds.values())
 632     {
 633       processCDSFeature(seq, data, proteins);
 634     }
 635   }
 636
 637   /**
 638    * Processes data for one parsed CDS feature to
 639    * <ul>
 640    * <li>create a protein product sequence for the translation</li>
 641    * <li>create a cross-reference to protein with mapping from dna</li>
 642    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 643    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 644    * </ul>
 645    *
 646    * @param SequenceI
 647    *          dna
 648    * @param proteins
 649    *          map of protein products so far derived from CDS data
 650    */
 651   void processCDSFeature(SequenceI dna, CdsData data,
 652           Map<String, SequenceI> proteins)
 653   {
 654     /*
 655      * parse location into a list of [start, end, start, end] positions
 656      */
 657     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 658
 659     MapList maplist = buildMappingToProtein(dna, exons, data);
 660
 661     int exonNumber = 0;
 662
 663     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 664     {
 665       int exonStart = exons[xint];
 666       int exonEnd = exons[xint + 1];
 667       int begin = Math.min(exonStart, exonEnd);
 668       int end = Math.max(exonStart, exonEnd);
 669       exonNumber++;
 670       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 671               exonNumber, data.proteinId);
 672
 673       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 674               this.sourceDb);
 675       for (Entry<String, String> val : data.cdsProps.entrySet())
 676       {
 677         sf.setValue(val.getKey(), val.getValue());
 678       }
 679
 680       sf.setEnaLocation(data.cdsLocation);
 681       boolean forwardStrand = exonStart <= exonEnd;
 682       sf.setStrand(forwardStrand ? "+" : "-");
 683       sf.setPhase(String.valueOf(data.codonStart - 1));
 684       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 685       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 686
 687       dna.addSequenceFeature(sf);
 688     }
 689
 690     boolean hasUniprotDbref = false;
 691     for (DBRefEntry xref : data.xrefs)
 692     {
 693       dna.addDBRef(xref);
 694       if (xref.getSource().equals(DBRefSource.UNIPROT))
 695       {
 696         /*
 697          * construct (or find) the sequence for (data.protein_id, data.translation)
 698          */
 699         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 700         Mapping map = new Mapping(protein, maplist);
 701         map.setMappedFromId(data.proteinId);
 702         xref.setMap(map);
 703
 704         /*
 705          * add DBRefs with mappings from dna to protein and the inverse
 706          */
 707         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 708         db1.setMap(new Mapping(dna, maplist.getInverse()));
 709         protein.addDBRef(db1);
 710
 711         hasUniprotDbref = true;
 712       }
 713     }
 714
 715     /*
 716      * if we have a product (translation) but no explicit Uniprot dbref
 717      * (example: EMBL M19487 protein_id AAB02592.1)
 718      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 719      */
 720     if (!hasUniprotDbref)
 721     {
 722       SequenceI protein = proteins.get(data.proteinId);
 723       if (protein == null)
 724       {
 725         protein = new Sequence(data.proteinId, data.translation);
 726         protein.setDescription(data.proteinName);
 727         proteins.put(data.proteinId, protein);
 728       }
 729       // assuming CDSPROTEIN sequence version = dna version (?!)
 730       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 731               this.version, data.proteinId);
 732       protein.addDBRef(db1);
 733
 734       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 735               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 736       Mapping map = new Mapping(protein, maplist);
 737       map.setMappedFromId(data.proteinId);
 738       dnaToEmblProteinRef.setMap(map);
 739       dna.addDBRef(dnaToEmblProteinRef);
 740     }
 741
 742     /*
 743      * comment brought forward from EmblXmlSource, lines 447-451:
 744      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 745      * sequence with the exon  map; if given a dataset reference, search
 746      * dataset for parent EMBL sequence if it exists and set its map;
 747      * make a new feature annotating the coding contig
 748      */
 749   }
 750
 751   /**
 752    * Computes a mapping from CDS positions in DNA sequence to protein product
 753    * positions, with allowance for stop codon or incomplete start codon
 754    *
 755    * @param dna
 756    * @param exons
 757    * @param data
 758    * @return
 759    */
 760   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 761           final CdsData data)
 762   {
 763     MapList dnaToProteinMapping = null;
 764     int peptideLength = data.translation.length();
 765
 766     int[] proteinRange = new int[] { 1, peptideLength };
 767     if (exons != null && exons.length > 0)
 768     {
 769       /*
 770        * We were able to parse 'location'; do a final
 771        * product length truncation check
 772        */
 773       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 774       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 775     }
 776     else
 777     {
 778       /*
 779        * workaround until we handle all 'location' formats fully
 780        * e.g. X53828.1:60..1058 or <123..>289
 781        */
 782       Cache.log.error(String.format(
 783               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 784                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 785               data.cdsLocation, sourceDb, this.accession));
 786
 787       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 788       int mappedDnaEnd = dna.getEnd();
 789       if (peptideLength * 3 == completeCodonsLength)
 790       {
 791         // this might occur for CDS sequences where no features are marked
 792         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 793         mappedDnaEnd = dna.getEnd();
 794       }
 795       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 796       {
 797         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 798         mappedDnaEnd = dna.getEnd() - 3;
 799       }
 800
 801       if (mappedDnaEnd != -1)
 802       {
 803         int[] cdsRanges = new int[] {
 804             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 805         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 806       }
 807     }
 808
 809     return dnaToProteinMapping;
 810   }
 811
 812   /**
 813    * Constructs a sequence for the protein product for the CDS data (if there is
 814    * one), and dbrefs with mappings from CDS to protein and the reverse
 815    *
 816    * @param dna
 817    * @param xref
 818    * @param data
 819    * @param proteins
 820    * @return
 821    */
 822   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 823           CdsData data, Map<String, SequenceI> proteins)
 824   {
 825     /*
 826      * check we have some data to work with
 827      */
 828     if (data.proteinId == null || data.translation == null)
 829     {
 830       return null;
 831     }
 832
 833     /*
 834      * Construct the protein sequence (if not already seen)
 835      */
 836     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 837     SequenceI protein = proteins.get(proteinSeqName);
 838     if (protein == null)
 839     {
 840       protein = new Sequence(proteinSeqName, data.translation, 1,
 841               data.translation.length());
 842       protein.setDescription(data.proteinName != null ? data.proteinName
 843               : "Protein Product from " + sourceDb);
 844       proteins.put(proteinSeqName, protein);
 845     }
 846
 847     return protein;
 848   }
 849
 850   /**
 851    * Returns the CDS location as a single array of [start, end, start, end...]
 852    * positions. If on the reverse strand, these will be in descending order.
 853    *
 854    * @param accession
 855    * @param location
 856    * @return
 857    */
 858   protected int[] getCdsRanges(String accession, String location)
 859   {
 860     if (location == null)
 861     {
 862       return new int[] {};
 863     }
 864
 865     try
 866     {
 867       List<int[]> ranges = DnaUtils.parseLocation(location);
 868       return MappingUtils.rangeListToArray(ranges);
 869     } catch (ParseException e)
 870     {
 871       Cache.log.warn(
 872               String.format("Not parsing inexact CDS location %s in ENA %s",
 873                       location, accession));
 874       return new int[] {};
 875     }
 876   }
 877
 878   /**
 879    * Output (print) is not implemented for EMBL flat file format
 880    */
 881   @Override
 882   public String print(SequenceI[] seqs, boolean jvsuffix)
 883   {
 884     return null;
 885   }
 886
 887   /**
 888    * Truncates (if necessary) the exon intervals to match 3 times the length of
 889    * the protein; also accepts 3 bases longer (for stop codon not included in
 890    * protein)
 891    *
 892    * @param proteinLength
 893    * @param exon
 894    *          an array of [start, end, start, end...] intervals
 895    * @return the same array (if unchanged) or a truncated copy
 896    */
 897   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 898   {
 899     if (proteinLength <= 0 || exon == null)
 900     {
 901       return exon;
 902     }
 903     int expectedCdsLength = proteinLength * 3;
 904     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 905
 906     /*
 907      * if exon length matches protein, or is shorter, or longer by the
 908      * length of a stop codon (3 bases), then leave it unchanged
 909      */
 910     if (expectedCdsLength >= exonLength
 911             || expectedCdsLength == exonLength - 3)
 912     {
 913       return exon;
 914     }
 915
 916     int origxon[];
 917     int sxpos = -1;
 918     int endxon = 0;
 919     origxon = new int[exon.length];
 920     System.arraycopy(exon, 0, origxon, 0, exon.length);
 921     int cdspos = 0;
 922     for (int x = 0; x < exon.length; x += 2)
 923     {
 924       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 925       if (expectedCdsLength <= cdspos)
 926       {
 927         // advanced beyond last codon.
 928         sxpos = x;
 929         if (expectedCdsLength != cdspos)
 930         {
 931           // System.err
 932           // .println("Truncating final exon interval on region by "
 933           // + (cdspos - cdslength));
 934         }
 935
 936         /*
 937          * shrink the final exon - reduce end position if forward
 938          * strand, increase it if reverse
 939          */
 940         if (exon[x + 1] >= exon[x])
 941         {
 942           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 943         }
 944         else
 945         {
 946           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 947         }
 948         break;
 949       }
 950     }
 951
 952     if (sxpos != -1)
 953     {
 954       // and trim the exon interval set if necessary
 955       int[] nxon = new int[sxpos + 2];
 956       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 957       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 958                                 // set
 959       exon = nxon;
 960     }
 961     return exon;
 962   }
 963 }