src/jalview/io/FlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
  29  * data. Example files (rather than formal specifications) are provided at
  30  *
  31  * <pre>
  32  * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
  33  * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
  34  * </pre>
  35  *
  36  * or to compare the same entry, see
  37  *
  38  * <pre>
  39  * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
  40  * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
  41  * </pre>
  42  *
  43  * The feature table part of the file has a common definition, only the start of
  44  * each line is formatted differently in GenBank and EMBL. See
  45  * http://www.insdc.org/files/feature_table.html#7.1.
  46  */
  47 public abstract class FlatFile extends AlignFile
  48 {
  /**
   * The text "location". NOTE(review): not referenced anywhere else in this
   * class - presumably used by subclasses; confirm before removing.
   */
  protected static final String LOCATION = "location";

  /**
   * A single double quote character ("), which delimits free text qualifier
   * values
   */
  protected static final String QUOTE = "\"";

  /**
   * Two adjacent double quote characters (""), which is how a double quote is
   * escaped inside a free text qualifier value
   */
  protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;

  /**
   * Regular expression matching a run of one or more whitespace characters;
   * used with String.split to break a line into tokens
   */
  protected static final String WHITESPACE = "\\s+";
  56
  57   /**
  58    * Removes leading or trailing double quotes (") unless doubled, and changes
  59    * any 'escaped' (doubled) double quotes to single characters. As per the
  60    * Feature Table specification for Qualifiers, Free Text.
  61    *
  62    * @param value
  63    * @return
  64    */
  65   protected static String removeQuotes(String value)
  66   {
  67     if (value == null)
  68     {
  69       return null;
  70     }
  71     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
  72     {
  73       value = value.substring(1);
  74     }
  75     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
  76     {
  77       value = value.substring(0, value.length() - 1);
  78     }
  79     value = value.replace(DOUBLED_QUOTE, QUOTE);
  80     return value;
  81   }
  82
  83   /**
  84    * Truncates (if necessary) the exon intervals to match 3 times the length of
  85    * the protein; also accepts 3 bases longer (for stop codon not included in
  86    * protein)
  87    *
  88    * @param proteinLength
  89    * @param exon
  90    *          an array of [start, end, start, end...] intervals
  91    * @return the same array (if unchanged) or a truncated copy
  92    */
  93   protected static int[] adjustForProteinLength(int proteinLength,
  94           int[] exon)
  95   {
  96     if (proteinLength <= 0 || exon == null)
  97     {
  98       return exon;
  99     }
 100     int expectedCdsLength = proteinLength * 3;
 101     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 102
 103     /*
 104      * if exon length matches protein, or is shorter, or longer by the
 105      * length of a stop codon (3 bases), then leave it unchanged
 106      */
 107     if (expectedCdsLength >= exonLength
 108             || expectedCdsLength == exonLength - 3)
 109     {
 110       return exon;
 111     }
 112
 113     int origxon[];
 114     int sxpos = -1;
 115     int endxon = 0;
 116     origxon = new int[exon.length];
 117     System.arraycopy(exon, 0, origxon, 0, exon.length);
 118     int cdspos = 0;
 119     for (int x = 0; x < exon.length; x += 2)
 120     {
 121       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 122       if (expectedCdsLength <= cdspos)
 123       {
 124         // advanced beyond last codon.
 125         sxpos = x;
 126         if (expectedCdsLength != cdspos)
 127         {
 128           // System.err
 129           // .println("Truncating final exon interval on region by "
 130           // + (cdspos - cdslength));
 131         }
 132
 133         /*
 134          * shrink the final exon - reduce end position if forward
 135          * strand, increase it if reverse
 136          */
 137         if (exon[x + 1] >= exon[x])
 138         {
 139           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 140         }
 141         else
 142         {
 143           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 144         }
 145         break;
 146       }
 147     }
 148
 149     if (sxpos != -1)
 150     {
 151       // and trim the exon interval set if necessary
 152       int[] nxon = new int[sxpos + 2];
 153       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 154       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 155                                 // set
 156       exon = nxon;
 157     }
 158     return exon;
 159   }
 160
  /*
   * values parsed from the data file
   */

  /**
   * The source database name supplied to the constructor; used (if not null)
   * to prefix the sequence name, and as the source of database references and
   * sequence features created by this class
   */
  protected String sourceDb;

  /**
   * The accession of the entry; buildSequence() uses it for the sequence name
   * and reports an error if it is null. Not assigned within this class -
   * presumably set by the subclass parser (confirm against subclasses).
   */
  protected String accession;

  /**
   * The version used for database references to this entry. Not assigned
   * within this class - presumably set by the subclass parser.
   */
  protected String version;

  /**
   * The description set on the constructed sequence. Not assigned within this
   * class - presumably set by the subclass parser.
   */
  protected String description;

  /**
   * The initial capacity of the buffer used by parseSequence(); defaults to
   * 128. Presumably subclasses set this to the sequence length declared in
   * the file - TODO confirm.
   */
  protected int length = 128;

  /**
   * Database references which buildSequence() adds to the constructed
   * sequence; initialised to an empty list by the constructor
   */
  protected List<DBRefEntry> dbrefs;

  /**
   * The sequence residues, as saved by parseSequence(); buildSequence()
   * reports an error if this is null
   */
  protected String sequenceString;

  /**
   * Data for each parsed CDS feature, keyed by its /protein_id value (see
   * parseCDSFeature); the constructor makes this a map sorted by key,
   * ignoring case
   */
  protected Map<String, CdsData> cds;
 179
 180   /**
 181    * Constructor
 182    *
 183    * @param fp
 184    * @param sourceId
 185    * @throws IOException
 186    */
 187   public FlatFile(FileParse fp, String sourceId) throws IOException
 188   {
 189     super(false, fp); // don't parse immediately
 190     this.sourceDb = sourceId;
 191     dbrefs = new ArrayList<>();
 192
 193     /*
 194      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 195      */
 196     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 197
 198     parse();
 199   }
 200
 201   /**
 202    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
 203    * returns the next line
 204    *
 205    * @param location
 206    * @return
 207    * @throws IOException
 208    */
 209   protected String parseCDSFeature(String location) throws IOException
 210   {
 211     String line;
 212
 213     /*
 214      * parse location, which can be over >1 line e.g. EAW51554
 215      */
 216     CdsData data = new CdsData();
 217     StringBuilder sb = new StringBuilder().append(location);
 218     line = parseFeatureQualifier(sb, false);
 219     data.cdsLocation = sb.toString();
 220
 221     while (line != null)
 222     {
 223       if (!isFeatureContinuationLine(line))
 224       {
 225         // e.g. start of next feature "FT source..."
 226         break;
 227       }
 228
 229       /*
 230        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 231        * - the value may extend over more than one line
 232        * - if the value has enclosing quotes, these are removed
 233        * - escaped double quotes ("") are reduced to a single character
 234        */
 235       int slashPos = line.indexOf('/');
 236       if (slashPos == -1)
 237       {
 238         Cache.log.error("Unexpected EMBL line ignored: " + line);
 239         line = nextLine();
 240         continue;
 241       }
 242       int eqPos = line.indexOf('=', slashPos + 1);
 243       if (eqPos == -1)
 244       {
 245         // can happen, e.g. /ribosomal_slippage
 246         line = nextLine();
 247         continue;
 248       }
 249       String qualifier = line.substring(slashPos + 1, eqPos);
 250       String value = line.substring(eqPos + 1);
 251       value = removeQuotes(value);
 252       sb = new StringBuilder().append(value);
 253       boolean asText = !"translation".equals(qualifier);
 254       line = parseFeatureQualifier(sb, asText);
 255       String featureValue = sb.toString();
 256
 257       if ("protein_id".equals(qualifier))
 258       {
 259         data.proteinId = featureValue;
 260       }
 261       else if ("codon_start".equals(qualifier))
 262       {
 263         try
 264         {
 265           data.codonStart = Integer.parseInt(featureValue.trim());
 266         } catch (NumberFormatException e)
 267         {
 268           Cache.log.error("Invalid codon_start in XML for " + this.accession
 269                   + ": " + e.getMessage());
 270         }
 271       }
 272       else if ("db_xref".equals(qualifier))
 273       {
 274         String[] parts = featureValue.split(":");
 275         if (parts.length == 2)
 276         {
 277           String db = parts[0].trim();
 278           db = DBRefUtils.getCanonicalName(db);
 279           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 280           data.xrefs.add(dbref);
 281         }
 282       }
 283       else if ("product".equals(qualifier))
 284       {
 285         data.proteinName = featureValue;
 286       }
 287       else if ("translation".equals(qualifier))
 288       {
 289         data.translation = featureValue;
 290       }
 291       else if (!"".equals(featureValue))
 292       {
 293         // throw anything else into the additional properties hash
 294         data.cdsProps.put(qualifier, featureValue);
 295       }
 296     }
 297
 298     if (data.proteinId != null)
 299     {
 300       this.cds.put(data.proteinId, data);
 301     }
 302     else
 303     {
 304       Cache.log.error("Ignoring CDS feature with no protein_id for "
 305               + sourceDb + ":" + accession);
 306     }
 307
 308     return line;
 309   }
 310
  /**
   * Answers true if the line is a continuation of the feature currently being
   * parsed (a further line of its location or of its qualifiers), or false if
   * it is anything else, for example the start of the next feature. Both
   * parseCDSFeature and parseFeatureQualifier stop reading the current feature
   * at the first line for which this returns false. Left to subclasses, since
   * the start of each line is formatted differently in GenBank and EMBL.
   *
   * @param line
   *          a (non-null) line of the file
   * @return
   */
  protected abstract boolean isFeatureContinuationLine(String line);
 312
  /**
   * Output (print) is not (yet) implemented for flat file format
   *
   * @param seqs
   *          ignored
   * @param jvsuffix
   *          ignored
   * @return always null
   */
  @Override
  public String print(SequenceI[] seqs, boolean jvsuffix)
  {
    // nothing is written; both arguments are unused
    return null;
  }
 321
 322   /**
 323    * Constructs and saves the sequence from parsed components
 324    */
 325   protected void buildSequence()
 326   {
 327     if (this.accession == null || this.sequenceString == null)
 328     {
 329       Cache.log.error("Failed to parse data from EMBL");
 330       return;
 331     }
 332
 333     String name = this.accession;
 334     if (this.sourceDb != null)
 335     {
 336       name = this.sourceDb + "|" + name;
 337     }
 338     SequenceI seq = new Sequence(name, this.sequenceString);
 339     seq.setDescription(this.description);
 340
 341     /*
 342      * add a DBRef to itself
 343      */
 344     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 345     int[] startEnd = new int[] { 1, seq.getLength() };
 346     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 347     seq.addDBRef(selfRef);
 348
 349     for (DBRefEntry dbref : this.dbrefs)
 350     {
 351       seq.addDBRef(dbref);
 352     }
 353
 354     processCDSFeatures(seq);
 355
 356     seq.deriveSequence();
 357
 358     addSequence(seq);
 359   }
 360
 361   /**
 362    * Process the CDS features, including generation of cross-references and
 363    * mappings to the protein products (translation)
 364    *
 365    * @param seq
 366    */
 367   protected void processCDSFeatures(SequenceI seq)
 368   {
 369     /*
 370      * record protein products found to avoid duplication i.e. >1 CDS with
 371      * the same /protein_id [though not sure I can find an example of this]
 372      */
 373     Map<String, SequenceI> proteins = new HashMap<>();
 374     for (CdsData data : cds.values())
 375     {
 376       processCDSFeature(seq, data, proteins);
 377     }
 378   }
 379
 380   /**
 381    * Processes data for one parsed CDS feature to
 382    * <ul>
 383    * <li>create a protein product sequence for the translation</li>
 384    * <li>create a cross-reference to protein with mapping from dna</li>
 385    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 386    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 387    * </ul>
 388    *
 389    * @param SequenceI
 390    *          dna
 391    * @param proteins
 392    *          map of protein products so far derived from CDS data
 393    */
 394   void processCDSFeature(SequenceI dna, CdsData data,
 395           Map<String, SequenceI> proteins)
 396   {
 397     /*
 398      * parse location into a list of [start, end, start, end] positions
 399      */
 400     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 401
 402     MapList maplist = buildMappingToProtein(dna, exons, data);
 403
 404     int exonNumber = 0;
 405
 406     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 407     {
 408       int exonStart = exons[xint];
 409       int exonEnd = exons[xint + 1];
 410       int begin = Math.min(exonStart, exonEnd);
 411       int end = Math.max(exonStart, exonEnd);
 412       exonNumber++;
 413       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 414               exonNumber, data.proteinId);
 415
 416       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 417               this.sourceDb);
 418       for (Entry<String, String> val : data.cdsProps.entrySet())
 419       {
 420         sf.setValue(val.getKey(), val.getValue());
 421       }
 422
 423       sf.setEnaLocation(data.cdsLocation);
 424       boolean forwardStrand = exonStart <= exonEnd;
 425       sf.setStrand(forwardStrand ? "+" : "-");
 426       sf.setPhase(String.valueOf(data.codonStart - 1));
 427       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 428       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 429
 430       dna.addSequenceFeature(sf);
 431     }
 432
 433     boolean hasUniprotDbref = false;
 434     for (DBRefEntry xref : data.xrefs)
 435     {
 436       dna.addDBRef(xref);
 437       if (xref.getSource().equals(DBRefSource.UNIPROT))
 438       {
 439         /*
 440          * construct (or find) the sequence for (data.protein_id, data.translation)
 441          */
 442         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 443         Mapping map = new Mapping(protein, maplist);
 444         map.setMappedFromId(data.proteinId);
 445         xref.setMap(map);
 446
 447         /*
 448          * add DBRefs with mappings from dna to protein and the inverse
 449          */
 450         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 451         db1.setMap(new Mapping(dna, maplist.getInverse()));
 452         protein.addDBRef(db1);
 453
 454         hasUniprotDbref = true;
 455       }
 456     }
 457
 458     /*
 459      * if we have a product (translation) but no explicit Uniprot dbref
 460      * (example: EMBL M19487 protein_id AAB02592.1)
 461      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 462      */
 463     if (!hasUniprotDbref)
 464     {
 465       SequenceI protein = proteins.get(data.proteinId);
 466       if (protein == null)
 467       {
 468         protein = new Sequence(data.proteinId, data.translation);
 469         protein.setDescription(data.proteinName);
 470         proteins.put(data.proteinId, protein);
 471       }
 472       // assuming CDSPROTEIN sequence version = dna version (?!)
 473       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 474               this.version, data.proteinId);
 475       protein.addDBRef(db1);
 476
 477       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 478               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 479       Mapping map = new Mapping(protein, maplist);
 480       map.setMappedFromId(data.proteinId);
 481       dnaToEmblProteinRef.setMap(map);
 482       dna.addDBRef(dnaToEmblProteinRef);
 483     }
 484
 485     /*
 486      * comment brought forward from EmblXmlSource, lines 447-451:
 487      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 488      * sequence with the exon  map; if given a dataset reference, search
 489      * dataset for parent EMBL sequence if it exists and set its map;
 490      * make a new feature annotating the coding contig
 491      */
 492   }
 493
 494   /**
 495    * Computes a mapping from CDS positions in DNA sequence to protein product
 496    * positions, with allowance for stop codon or incomplete start codon
 497    *
 498    * @param dna
 499    * @param exons
 500    * @param data
 501    * @return
 502    */
 503   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 504           final CdsData data)
 505   {
 506     MapList dnaToProteinMapping = null;
 507     int peptideLength = data.translation.length();
 508
 509     int[] proteinRange = new int[] { 1, peptideLength };
 510     if (exons != null && exons.length > 0)
 511     {
 512       /*
 513        * We were able to parse 'location'; do a final
 514        * product length truncation check
 515        */
 516       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 517       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 518     }
 519     else
 520     {
 521       /*
 522        * workaround until we handle all 'location' formats fully
 523        * e.g. X53828.1:60..1058 or <123..>289
 524        */
 525       Cache.log.error(String.format(
 526               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 527                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 528               data.cdsLocation, sourceDb, this.accession));
 529
 530       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 531       int mappedDnaEnd = dna.getEnd();
 532       if (peptideLength * 3 == completeCodonsLength)
 533       {
 534         // this might occur for CDS sequences where no features are marked
 535         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 536         mappedDnaEnd = dna.getEnd();
 537       }
 538       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 539       {
 540         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 541         mappedDnaEnd = dna.getEnd() - 3;
 542       }
 543
 544       if (mappedDnaEnd != -1)
 545       {
 546         int[] cdsRanges = new int[] {
 547             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 548         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 549       }
 550     }
 551
 552     return dnaToProteinMapping;
 553   }
 554
 555   /**
 556    * Constructs a sequence for the protein product for the CDS data (if there is
 557    * one), and dbrefs with mappings from CDS to protein and the reverse
 558    *
 559    * @param dna
 560    * @param xref
 561    * @param data
 562    * @param proteins
 563    * @return
 564    */
 565   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 566           CdsData data, Map<String, SequenceI> proteins)
 567   {
 568     /*
 569      * check we have some data to work with
 570      */
 571     if (data.proteinId == null || data.translation == null)
 572     {
 573       return null;
 574     }
 575
 576     /*
 577      * Construct the protein sequence (if not already seen)
 578      */
 579     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 580     SequenceI protein = proteins.get(proteinSeqName);
 581     if (protein == null)
 582     {
 583       protein = new Sequence(proteinSeqName, data.translation, 1,
 584               data.translation.length());
 585       protein.setDescription(data.proteinName != null ? data.proteinName
 586               : "Protein Product from " + sourceDb);
 587       proteins.put(proteinSeqName, protein);
 588     }
 589
 590     return protein;
 591   }
 592
 593   /**
 594    * Returns the CDS location as a single array of [start, end, start, end...]
 595    * positions. If on the reverse strand, these will be in descending order.
 596    *
 597    * @param accession
 598    * @param location
 599    * @return
 600    */
 601   protected int[] getCdsRanges(String accession, String location)
 602   {
 603     if (location == null)
 604     {
 605       return new int[] {};
 606     }
 607
 608     try
 609     {
 610       List<int[]> ranges = DnaUtils.parseLocation(location);
 611       return MappingUtils.listToArray(ranges);
 612     } catch (ParseException e)
 613     {
 614       Cache.log.warn(
 615               String.format("Not parsing inexact CDS location %s in ENA %s",
 616                       location, accession));
 617       return new int[] {};
 618     }
 619   }
 620
 621   /**
 622    * Reads the value of a feature (FT) qualifier from one or more lines of the
 623    * file, and returns the next line after that. Values are appended to the
 624    * string buffer, which should be already primed with the value read from the
 625    * first line for the qualifier (with any leading double quote removed).
 626    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 627    * reduced to one only. For example for
 628    *
 629    * <pre>
 630    * FT      /note="gene_id=hCG28070.3
 631    * FT      ""foobar"" isoform=CRA_b"
 632    * the returned value is
 633    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 634    * </pre>
 635    *
 636    * Note the side-effect of this method, to advance data reading to the next
 637    * line after the feature qualifier (which could be another qualifier, a
 638    * different feature, a non-feature line, or null at end of file).
 639    *
 640    * @param sb
 641    *          a string buffer primed with the first line of the value
 642    * @param asText
 643    * @return
 644    * @throws IOException
 645    */
 646   String parseFeatureQualifier(StringBuilder sb, boolean asText)
 647           throws IOException
 648   {
 649     String line;
 650     while ((line = nextLine()) != null)
 651     {
 652       if (!isFeatureContinuationLine(line))
 653       {
 654         break; // reached next feature or other input line
 655       }
 656       String[] tokens = line.split(WHITESPACE);
 657       if (tokens.length < 2)
 658       {
 659         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 660                 + ": " + line);
 661         break;
 662       }
 663       if (tokens[1].startsWith("/"))
 664       {
 665         break; // next feature qualifier
 666       }
 667
 668       /*
 669        * if text (e.g. /product), add a word separator for a new line,
 670        * else (e.g. /translation) don't
 671        */
 672       if (asText)
 673       {
 674         sb.append(" ");
 675       }
 676
 677       /*
 678        * remove trailing " and unescape doubled ""
 679        */
 680       String data = removeQuotes(tokens[1]);
 681       sb.append(data);
 682     }
 683
 684     return line;
 685   }
 686
 687   /**
 688    * Reads and saves the sequence, read from the lines following the ORIGIN
 689    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
 690    * discarded. Returns the next line following the sequence data (the next line
 691    * that doesn't start with whitespace).
 692    *
 693    * @throws IOException
 694    */
 695   protected String parseSequence() throws IOException
 696   {
 697     StringBuilder sb = new StringBuilder(this.length);
 698     String line = nextLine();
 699     while (line != null && line.startsWith(" "))
 700     {
 701       line = line.trim();
 702       String[] blocks = line.split(WHITESPACE);
 703
 704       /*
 705        * the first or last block on each line might be a position count - omit
 706        */
 707       for (int i = 0; i < blocks.length; i++)
 708       {
 709         try
 710         {
 711           Long.parseLong(blocks[i]);
 712           // position counter - ignore it
 713         } catch (NumberFormatException e)
 714         {
 715           // sequence data - append it
 716           sb.append(blocks[i]);
 717         }
 718       }
 719       line = nextLine();
 720     }
 721     this.sequenceString = sb.toString();
 722
 723     return line;
 724   }
 725
 726   /**
 727    * Processes a feature line. If it declares a feature type of interest
 728    * (currently, only CDS is processed), processes all of the associated lines
 729    * (feature qualifiers), and returns the next line after that, otherwise
 730    * simply returns the next line.
 731    *
 732    * @param line
 733    *          the first line for the feature (with initial FT omitted for EMBL
 734    *          format)
 735    * @return
 736    * @throws IOException
 737    */
 738   protected String parseFeature(String line) throws IOException
 739   {
 740     String[] tokens = line.trim().split(WHITESPACE);
 741     if (tokens.length < 2 || !"CDS".equals(tokens[0]))
 742     {
 743       return nextLine();
 744     }
 745
 746     return parseCDSFeature(tokens[1]);
 747   }
 748 }
 749
/**
 * A data bean class to hold values parsed from one CDS Feature. Fields are
 * package-visible and are written directly by the parser in FlatFile.
 */
class CdsData
{
  /**
   * from /translation qualifier; null if the CDS has none
   */
  String translation;

  /**
   * the raw location value e.g. join(1..1234,2012..2837), with any
   * continuation lines appended
   */
  String cdsLocation;

  /**
   * from /codon_start qualifier; stays at 1 if the qualifier is absent or
   * its value is not a number
   */
  int codonStart = 1;

  /**
   * from /product qualifier; used for protein description
   */
  String proteinName;

  /**
   * from /protein_id qualifier; null if the CDS has none
   */
  String proteinId;

  /**
   * from /db_xref qualifiers of the form database:accession
   */
  List<DBRefEntry> xrefs = new ArrayList<>();

  /**
   * other qualifiers with a non-empty value, keyed by qualifier name
   */
  Map<String, String> cdsProps = new Hashtable<>();
}