src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A class that provides selective parsing of the EMBL flatfile format.
  29  * <p>
  30  * The initial implementation is limited to extracting fields used by Jalview
  31  * after fetching an EMBL or EMBLCDS entry:
  32  *
  33  * <pre>
  34  * accession, version, sequence, xref
  35  * and (for CDS feature) location, protein_id, product, codon_start, translation
  36  * </pre>
  37  *
  38  * For a complete parser, it may be best to adopt that provided in
  39  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  40  * (but note this has a dependency on the Apache Commons library)
  41  *
  42  * @author gmcarstairs
  43  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  44  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  45  */
  46 public class EmblFlatFile extends AlignFile // FileParse
  47 {
  48   private static final String QUOTE = "\"";
  49
  50   private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  51
  52   /**
  53    * when true, interpret the mol_type 'source' feature attribute
  54    * and generate an RNA sequence from the DNA record
  55    */
  56   private boolean produceRna=true;
  57   /**
  58    * A data bean class to hold values parsed from one CDS Feature (FT)
  59    */
  60   class CdsData
  61   {
  62     String translation; // from CDS feature /translation
  63
  64     String cdsLocation; // CDS /location raw value
  65
  66     int codonStart = 1; // from CDS /codon_start
  67
  68     String proteinName; // from CDS /product; used for protein description
  69
  70     String proteinId; // from CDS /protein_id
  71
  72     List<DBRefEntry> xrefs = new ArrayList<>(); // from CDS /db_xref qualifiers
  73
  74     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  75   }
  76
  77   private static final String WHITESPACE = "\\s+";
  78
  79   private String sourceDb;
  80
  81   /*
  82    * values parsed from the EMBL flatfile record
  83    */
  84   private String accession; // from ID (first token)
  85
  86   private String version; // from ID (second token)
  87
  88   private String description; // from (first) DE line
  89
  90   private int length = 128; // from ID (7th token), with usable default
  91
  92   private List<DBRefEntry> dbrefs; // from DR
  93
  94   private boolean sequenceStringIsRNA=false;
  95   private String sequenceString; // from SQ lines
  96
  97   /*
  98    * parsed CDS data fields, keyed by protein_id
  99    */
 100   private Map<String, CdsData> cds;
 101
 102   /**
 103    * Constructor
 104    *
 105    * @param fp
 106    * @param sourceId
 107    * @throws IOException
 108    */
 109   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
 110   {
 111     super(false, fp); // don't parse immediately
 112     this.sourceDb = sourceId;
 113     dbrefs = new ArrayList<>();
 114
 115     /*
 116      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 117      */
 118     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 119   }
 120
 121   /**
 122    * Parses the flatfile, and if successful, saves as an annotated sequence
 123    * which may be retrieved by calling {@code getSequence()}
 124    *
 125    * @throws IOException
 126    */
 127   public void parse() throws IOException
 128   {
 129     String line = nextLine();
 130     while (line != null)
 131     {
 132       if (line.startsWith("ID"))
 133       {
 134         line = parseID(line);
 135       }
 136       else if (line.startsWith("DE"))
 137       {
 138         line = parseDE(line);
 139       }
 140       else if (line.startsWith("DR"))
 141       {
 142         line = parseDR(line);
 143       }
 144       else if (line.startsWith("SQ"))
 145       {
 146         line = parseSQ();
 147       }
 148       else if (line.startsWith("FT"))
 149       {
 150         line = parseFT(line);
 151       }
 152       else
 153       {
 154         line = nextLine();
 155       }
 156     }
 157     buildSequence();
 158   }
 159
 160   /**
 161    * Extracts and saves the primary accession and version (SV value) from an ID
 162    * line, or null if not found. Returns the next line after the one processed.
 163    *
 164    * @param line
 165    * @throws IOException
 166    */
 167   String parseID(String line) throws IOException
 168   {
 169     String[] tokens = line.substring(2).split(";");
 170
 171     /*
 172      * first is primary accession
 173      */
 174     String token = tokens[0].trim();
 175     if (!token.isEmpty())
 176     {
 177       this.accession = token;
 178     }
 179
 180     /*
 181      * second token is 'SV versionNo'
 182      */
 183     if (tokens.length > 1)
 184     {
 185       token = tokens[1].trim();
 186       if (token.startsWith("SV"))
 187       {
 188         String[] bits = token.trim().split(WHITESPACE);
 189         this.version = bits[bits.length - 1];
 190       }
 191     }
 192
 193     /*
 194      * seventh token is 'length BP'
 195      */
 196     if (tokens.length > 6)
 197     {
 198       token = tokens[6].trim();
 199       String[] bits = token.trim().split(WHITESPACE);
 200       try
 201       {
 202         this.length = Integer.valueOf(bits[0]);
 203       } catch (NumberFormatException e)
 204       {
 205         Cache.log.error("bad length read in flatfile, line: " + line);
 206       }
 207     }
 208
 209     return nextLine();
 210   }
 211
 212   /**
 213    * Reads sequence description from the first DE line found. Any trailing
 214    * period is discarded. If there are multiple DE lines, only the first (short
 215    * description) is read, the rest are ignored.
 216    *
 217    * @param line
 218    * @return
 219    * @throws IOException
 220    */
 221   String parseDE(String line) throws IOException
 222   {
 223     String desc = line.substring(2).trim();
 224     if (desc.endsWith("."))
 225     {
 226       desc = desc.substring(0, desc.length() - 1);
 227     }
 228     this.description = desc;
 229
 230     /*
 231      * pass over any additional DE lines
 232      */
 233     while ((line = nextLine()) != null)
 234     {
 235       if (!line.startsWith("DE"))
 236       {
 237         break;
 238       }
 239     }
 240
 241     return line;
 242   }
 243
 244   /**
 245    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 246    * the line following the line processed.
 247    *
 248    * @param line
 249    * @throws IOException
 250    */
 251   String parseDR(String line) throws IOException
 252   {
 253     String[] tokens = line.substring(2).split(";");
 254     if (tokens.length > 1)
 255     {
 256       /*
 257        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 258        */
 259       String db = tokens[0].trim();
 260       db = DBRefUtils.getCanonicalName(db);
 261       String acc = tokens[1].trim();
 262       if (acc.endsWith("."))
 263       {
 264         acc = acc.substring(0, acc.length() - 1);
 265       }
 266       String version = "0";
 267       if (tokens.length > 2)
 268       {
 269         String secondaryId = tokens[2].trim();
 270         if (!secondaryId.isEmpty())
 271         {
 272           // todo: is this right? secondary id is not a version number
 273           // version = secondaryId;
 274         }
 275       }
 276       this.dbrefs.add(new DBRefEntry(db, version, acc));
 277     }
 278
 279     return nextLine();
 280   }
 281
 282   /**
 283    * Reads and saves the sequence, read from the lines following the SQ line.
 284    * Whitespace and position counters are discarded. Returns the next line
 285    * following the sequence data (the next line that doesn't start with
 286    * whitespace).
 287    *
 288    * @throws IOException
 289    */
 290   String parseSQ() throws IOException
 291   {
 292     StringBuilder sb = new StringBuilder(this.length);
 293     String line = nextLine();
 294     while (line != null && line.startsWith(" "))
 295     {
 296       line = line.trim();
 297       String[] blocks = line.split(WHITESPACE);
 298
 299       /*
 300        * omit the last block (position counter) on each line
 301        */
 302       for (int i = 0; i < blocks.length - 1; i++)
 303       {
 304         sb.append(blocks[i]);
 305       }
 306       line = nextLine();
 307     }
 308     this.sequenceString = sb.toString();
 309
 310     return line;
 311   }
 312
 313   /**
 314    * Processes an FT line. If it declares a feature type of interest (currently,
 315    * only CDS is processed), processes all of the associated lines (feature
 316    * qualifiers), and returns the next line after that, otherwise simply returns
 317    * the next line.
 318    *
 319    * @param line
 320    * @return
 321    * @throws IOException
 322    */
 323   String parseFT(String line) throws IOException
 324   {
 325     String[] tokens = line.split(WHITESPACE);
 326     if (tokens.length < 3 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
 327     {
 328       return nextLine();
 329     }
 330
 331     if (tokens[1].equals("source"))
 332     {
 333       return parseSourceQualifiers(tokens);
 334     }
 335
 336     /*
 337      * parse location - which may be over more than one line e.g. EAW51554
 338      */
 339     CdsData data = new CdsData();
 340     data.cdsLocation = tokens[2];
 341     // TODO location can be over >1 line e.g. EAW51554
 342
 343     line = nextLine();
 344     while (line != null)
 345     {
 346       if (!line.startsWith("FT    ")) // 4 spaces
 347       {
 348         // e.g. start of next feature "FT source..."
 349         break;
 350       }
 351
 352       /*
 353        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 354        * - the value may extend over more than one line
 355        * - if the value has enclosing quotes, these are removed
 356        * - escaped double quotes ("") are reduced to a single character
 357        */
 358       int slashPos = line.indexOf('/');
 359       if (slashPos == -1)
 360       {
 361         Cache.log.error("Unexpected EMBL line ignored: " + line);
 362         line = nextLine();
 363         continue;
 364       }
 365       int eqPos = line.indexOf('=', slashPos + 1);
 366       if (eqPos == -1)
 367       {
 368         // can happen, e.g. /ribosomal_slippage
 369         // Cache.log.error("Unexpected EMBL line ignored: " + line);
 370         line = nextLine();
 371         continue;
 372       }
 373       String qualifier = line.substring(slashPos + 1, eqPos);
 374       String value = line.substring(eqPos + 1);
 375       value = removeQuotes(value);
 376       StringBuilder sb = new StringBuilder().append(value);
 377       line = parseFeatureQualifier(sb, qualifier);
 378       String featureValue = sb.toString();
 379
 380       if ("protein_id".equals(qualifier))
 381       {
 382         data.proteinId = featureValue;
 383       }
 384       else if ("codon_start".equals(qualifier))
 385       {
 386         try
 387         {
 388           data.codonStart = Integer.parseInt(featureValue.trim());
 389         } catch (NumberFormatException e)
 390         {
 391           Cache.log.error("Invalid codon_start in XML for " + this.accession
 392                   + ": " + e.getMessage());
 393         }
 394       }
 395       else if ("db_xref".equals(qualifier))
 396       {
 397         String[] parts = featureValue.split(":");
 398         if (parts.length == 2)
 399         {
 400           String db = parts[0].trim();
 401           db = DBRefUtils.getCanonicalName(db);
 402           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 403           data.xrefs.add(dbref);
 404         }
 405       }
 406       else if ("product".equals(qualifier))
 407       {
 408         data.proteinName = featureValue;
 409       }
 410       else if ("translation".equals(qualifier))
 411       {
 412         data.translation = featureValue;
 413       }
 414       else if (!"".equals(featureValue))
 415       {
 416         // throw anything else into the additional properties hash
 417         data.cdsProps.put(qualifier, featureValue);
 418       }
 419     }
 420
 421     if (data.proteinId != null)
 422     {
 423       this.cds.put(data.proteinId, data);
 424     }
 425     else
 426     {
 427       Cache.log.error("Ignoring CDS feature with no protein_id for "
 428               + sourceDb + ":" + accession);
 429     }
 430
 431     return line;
 432   }
 433
 434   /**
 435    * process attributes for 'source' until the next FT feature entry
 436    * only interested in 'mol_type'
 437    * @param tokens
 438    * @return
 439    * @throws IOException
 440    */
 441   private String parseSourceQualifiers(String[] tokens) throws IOException
 442   {
 443     if (!"source".equals(tokens[1]))
 444     {
 445       throw (new RuntimeException("Not given a source qualifier"));
 446     }
 447     // search for mol_type attribute
 448
 449     StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
 450                                                               // sequence
 451
 452     String line = parseFeatureQualifier(sb, "source");
 453     while (line != null)
 454     {
 455       if (!line.startsWith("FT    ")) // four spaces, end of this feature table
 456                                       // entry
 457       {
 458         return line;
 459       }
 460
 461       int p = line.indexOf("\\mol_type");
 462       int qs = line.indexOf("\"", p);
 463       int qe = line.indexOf("\"", qs + 1);
 464       String qualifier=line.substring(qs,qe).toLowerCase();
 465       if (qualifier.indexOf("rna") > -1)
 466       {
 467         sequenceStringIsRNA = true;
 468       }
 469       if (qualifier.indexOf("dna") > -1)
 470       {
 471         sequenceStringIsRNA = false;
 472       }
 473       line=parseFeatureQualifier(sb, "source");
 474     }
 475     return line;
 476   }
 477
 478   /**
 479    * Removes leading or trailing double quotes (") unless doubled, and changes
 480    * any 'escaped' (doubled) double quotes to single characters. As per the
 481    * Feature Table specification for Qualifiers, Free Text.
 482    *
 483    * @param value
 484    * @return
 485    */
 486   static String removeQuotes(String value)
 487   {
 488     if (value == null)
 489     {
 490       return null;
 491     }
 492     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
 493     {
 494       value = value.substring(1);
 495     }
 496     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
 497     {
 498       value = value.substring(0, value.length() - 1);
 499     }
 500     value = value.replace(DOUBLED_QUOTE, QUOTE);
 501     return value;
 502   }
 503
 504   /**
 505    * Reads the value of a feature (FT) qualifier from one or more lines of the
 506    * file, and returns the next line after that. Values are appended to the
 507    * string buffer, which should be already primed with the value read from the
 508    * first line for the qualifier (with any leading double quote removed).
 509    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 510    * reduced to one only. For example for
 511    *
 512    * <pre>
 513    * FT      /note="gene_id=hCG28070.3
 514    * FT      ""foobar"" isoform=CRA_b"
 515    * the returned value is
 516    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 517    * </pre>
 518    *
 519    * Note the side-effect of this method, to advance data reading to the next
 520    * line after the feature qualifier.
 521    *
 522    * @param sb
 523    *          a string buffer primed with the first line of the value
 524    * @param qualifierName
 525    * @return
 526    * @throws IOException
 527    */
 528   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 529           throws IOException
 530   {
 531     String line;
 532     while ((line = nextLine()) != null)
 533     {
 534       if (!line.startsWith("FT    "))
 535       {
 536         break; // reached next feature or other input line
 537       }
 538       String[] tokens = line.split(WHITESPACE);
 539       if (tokens.length < 2)
 540       {
 541         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 542                 + ": " + line);
 543         break;
 544       }
 545       if (tokens[1].startsWith("/"))
 546       {
 547         break; // next feature qualifier
 548       }
 549
 550       /*
 551        * heuristic rule: most multi-line value (e.g. /product) are text,
 552        * so add a space for word boundary at a new line; not for translation
 553        */
 554       if (!"translation".equals(qualifierName))
 555       {
 556         sb.append(" ");
 557       }
 558
 559       /*
 560        * remove trailing " and unescape doubled ""
 561        */
 562       String data = removeQuotes(tokens[1]);
 563       sb.append(data);
 564     }
 565
 566     return line;
 567   }
 568
 569   /**
 570    * Constructs and saves the sequence from parsed components
 571    */
 572   void buildSequence()
 573   {
 574     if (this.accession == null || this.sequenceString == null)
 575     {
 576       Cache.log.error("Failed to parse data from EMBL");
 577       return;
 578     }
 579
 580     String name = this.accession;
 581     if (this.sourceDb != null)
 582     {
 583       name = this.sourceDb + "|" + name;
 584     }
 585
 586     if (produceRna && sequenceStringIsRNA)
 587     {
 588       sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
 589     }
 590
 591     SequenceI seq = new Sequence(name, this.sequenceString);
 592     seq.setDescription(this.description);
 593
 594     /*
 595      * add a DBRef to itself
 596      */
 597     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 598     int[] startEnd = new int[] { 1, seq.getLength() };
 599     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 600     seq.addDBRef(selfRef);
 601
 602     for (DBRefEntry dbref : this.dbrefs)
 603     {
 604       seq.addDBRef(dbref);
 605     }
 606
 607     processCDSFeatures(seq);
 608
 609     seq.deriveSequence();
 610
 611     addSequence(seq);
 612   }
 613
 614   /**
 615    * Process the CDS features, including generation of cross-references and
 616    * mappings to the protein products (translation)
 617    *
 618    * @param seq
 619    */
 620   protected void processCDSFeatures(SequenceI seq)
 621   {
 622     /*
 623      * record protein products found to avoid duplication i.e. >1 CDS with
 624      * the same /protein_id [though not sure I can find an example of this]
 625      */
 626     Map<String, SequenceI> proteins = new HashMap<>();
 627     for (CdsData data : cds.values())
 628     {
 629       processCDSFeature(seq, data, proteins);
 630     }
 631   }
 632
 633   /**
 634    * Processes data for one parsed CDS feature to
 635    * <ul>
 636    * <li>create a protein product sequence for the translation</li>
 637    * <li>create a cross-reference to protein with mapping from dna</li>
 638    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 639    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 640    * </ul>
 641    *
 642    * @param SequenceI
 643    *          dna
 644    * @param proteins
 645    *          map of protein products so far derived from CDS data
 646    */
 647   void processCDSFeature(SequenceI dna, CdsData data,
 648           Map<String, SequenceI> proteins)
 649   {
 650     /*
 651      * parse location into a list of [start, end, start, end] positions
 652      */
 653     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 654
 655     MapList maplist = buildMappingToProtein(dna, exons, data);
 656
 657     int exonNumber = 0;
 658
 659     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 660     {
 661       int exonStart = exons[xint];
 662       int exonEnd = exons[xint + 1];
 663       int begin = Math.min(exonStart, exonEnd);
 664       int end = Math.max(exonStart, exonEnd);
 665       exonNumber++;
 666       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 667               exonNumber, data.proteinId);
 668
 669       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 670               this.sourceDb);
 671       for (Entry<String, String> val : data.cdsProps.entrySet())
 672       {
 673         sf.setValue(val.getKey(), val.getValue());
 674       }
 675
 676       sf.setEnaLocation(data.cdsLocation);
 677       boolean forwardStrand = exonStart <= exonEnd;
 678       sf.setStrand(forwardStrand ? "+" : "-");
 679       sf.setPhase(String.valueOf(data.codonStart - 1));
 680       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 681       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 682
 683       dna.addSequenceFeature(sf);
 684     }
 685
 686     boolean hasUniprotDbref = false;
 687     for (DBRefEntry xref : data.xrefs)
 688     {
 689       dna.addDBRef(xref);
 690       if (xref.getSource().equals(DBRefSource.UNIPROT))
 691       {
 692         /*
 693          * construct (or find) the sequence for (data.protein_id, data.translation)
 694          */
 695         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 696         Mapping map = new Mapping(protein, maplist);
 697         map.setMappedFromId(data.proteinId);
 698         xref.setMap(map);
 699
 700         /*
 701          * add DBRefs with mappings from dna to protein and the inverse
 702          */
 703         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 704         db1.setMap(new Mapping(dna, maplist.getInverse()));
 705         protein.addDBRef(db1);
 706
 707         hasUniprotDbref = true;
 708       }
 709     }
 710
 711     /*
 712      * if we have a product (translation) but no explicit Uniprot dbref
 713      * (example: EMBL M19487 protein_id AAB02592.1)
 714      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 715      */
 716     if (!hasUniprotDbref)
 717     {
 718       SequenceI protein = proteins.get(data.proteinId);
 719       if (protein == null)
 720       {
 721         protein = new Sequence(data.proteinId, data.translation);
 722         protein.setDescription(data.proteinName);
 723         proteins.put(data.proteinId, protein);
 724       }
 725       // assuming CDSPROTEIN sequence version = dna version (?!)
 726       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 727               this.version, data.proteinId);
 728       protein.addDBRef(db1);
 729
 730       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 731               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 732       Mapping map = new Mapping(protein, maplist);
 733       map.setMappedFromId(data.proteinId);
 734       dnaToEmblProteinRef.setMap(map);
 735       dna.addDBRef(dnaToEmblProteinRef);
 736     }
 737
 738     /*
 739      * comment brought forward from EmblXmlSource, lines 447-451:
 740      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 741      * sequence with the exon  map; if given a dataset reference, search
 742      * dataset for parent EMBL sequence if it exists and set its map;
 743      * make a new feature annotating the coding contig
 744      */
 745   }
 746
 747   /**
 748    * Computes a mapping from CDS positions in DNA sequence to protein product
 749    * positions, with allowance for stop codon or incomplete start codon
 750    *
 751    * @param dna
 752    * @param exons
 753    * @param data
 754    * @return
 755    */
 756   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 757           final CdsData data)
 758   {
 759     MapList dnaToProteinMapping = null;
 760     int peptideLength = data.translation.length();
 761
 762     int[] proteinRange = new int[] { 1, peptideLength };
 763     if (exons != null && exons.length > 0)
 764     {
 765       /*
 766        * We were able to parse 'location'; do a final
 767        * product length truncation check
 768        */
 769       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 770       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 771     }
 772     else
 773     {
 774       /*
 775        * workaround until we handle all 'location' formats fully
 776        * e.g. X53828.1:60..1058 or <123..>289
 777        */
 778       Cache.log.error(String.format(
 779               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 780                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 781               data.cdsLocation, sourceDb, this.accession));
 782
 783       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 784       int mappedDnaEnd = dna.getEnd();
 785       if (peptideLength * 3 == completeCodonsLength)
 786       {
 787         // this might occur for CDS sequences where no features are marked
 788         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 789         mappedDnaEnd = dna.getEnd();
 790       }
 791       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 792       {
 793         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 794         mappedDnaEnd = dna.getEnd() - 3;
 795       }
 796
 797       if (mappedDnaEnd != -1)
 798       {
 799         int[] cdsRanges = new int[] {
 800             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 801         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 802       }
 803     }
 804
 805     return dnaToProteinMapping;
 806   }
 807
 808   /**
 809    * Constructs a sequence for the protein product for the CDS data (if there is
 810    * one), and dbrefs with mappings from CDS to protein and the reverse
 811    *
 812    * @param dna
 813    * @param xref
 814    * @param data
 815    * @param proteins
 816    * @return
 817    */
 818   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 819           CdsData data, Map<String, SequenceI> proteins)
 820   {
 821     /*
 822      * check we have some data to work with
 823      */
 824     if (data.proteinId == null || data.translation == null)
 825     {
 826       return null;
 827     }
 828
 829     /*
 830      * Construct the protein sequence (if not already seen)
 831      */
 832     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 833     SequenceI protein = proteins.get(proteinSeqName);
 834     if (protein == null)
 835     {
 836       protein = new Sequence(proteinSeqName, data.translation, 1,
 837               data.translation.length());
 838       protein.setDescription(data.proteinName != null ? data.proteinName
 839               : "Protein Product from " + sourceDb);
 840       proteins.put(proteinSeqName, protein);
 841     }
 842
 843     return protein;
 844   }
 845
 846   /**
 847    * Returns the CDS location as a single array of [start, end, start, end...]
 848    * positions. If on the reverse strand, these will be in descending order.
 849    *
 850    * @param accession
 851    * @param location
 852    * @return
 853    */
 854   protected int[] getCdsRanges(String accession, String location)
 855   {
 856     if (location == null)
 857     {
 858       return new int[] {};
 859     }
 860
 861     try
 862     {
 863       List<int[]> ranges = DnaUtils.parseLocation(location);
 864       return MappingUtils.listToArray(ranges);
 865     } catch (ParseException e)
 866     {
 867       Cache.log.warn(
 868               String.format("Not parsing inexact CDS location %s in ENA %s",
 869                       location, accession));
 870       return new int[] {};
 871     }
 872   }
 873
 874   /**
 875    * Output (print) is not implemented for EMBL flat file format
 876    */
 877   @Override
 878   public String print(SequenceI[] seqs, boolean jvsuffix)
 879   {
 880     return null;
 881   }
 882
 883   /**
 884    * Truncates (if necessary) the exon intervals to match 3 times the length of
 885    * the protein; also accepts 3 bases longer (for stop codon not included in
 886    * protein)
 887    *
 888    * @param proteinLength
 889    * @param exon
 890    *          an array of [start, end, start, end...] intervals
 891    * @return the same array (if unchanged) or a truncated copy
 892    */
 893   static int[] adjustForProteinLength(int proteinLength, int[] exon)
 894   {
 895     if (proteinLength <= 0 || exon == null)
 896     {
 897       return exon;
 898     }
 899     int expectedCdsLength = proteinLength * 3;
 900     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 901
 902     /*
 903      * if exon length matches protein, or is shorter, or longer by the
 904      * length of a stop codon (3 bases), then leave it unchanged
 905      */
 906     if (expectedCdsLength >= exonLength
 907             || expectedCdsLength == exonLength - 3)
 908     {
 909       return exon;
 910     }
 911
 912     int origxon[];
 913     int sxpos = -1;
 914     int endxon = 0;
 915     origxon = new int[exon.length];
 916     System.arraycopy(exon, 0, origxon, 0, exon.length);
 917     int cdspos = 0;
 918     for (int x = 0; x < exon.length; x += 2)
 919     {
 920       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 921       if (expectedCdsLength <= cdspos)
 922       {
 923         // advanced beyond last codon.
 924         sxpos = x;
 925         if (expectedCdsLength != cdspos)
 926         {
 927           // System.err
 928           // .println("Truncating final exon interval on region by "
 929           // + (cdspos - cdslength));
 930         }
 931
 932         /*
 933          * shrink the final exon - reduce end position if forward
 934          * strand, increase it if reverse
 935          */
 936         if (exon[x + 1] >= exon[x])
 937         {
 938           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 939         }
 940         else
 941         {
 942           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 943         }
 944         break;
 945       }
 946     }
 947
 948     if (sxpos != -1)
 949     {
 950       // and trim the exon interval set if necessary
 951       int[] nxon = new int[sxpos + 2];
 952       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 953       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 954                                 // set
 955       exon = nxon;
 956     }
 957     return exon;
 958   }
 959 }