src/jalview/io/FlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
  29  * data. Example files (rather than formal specifications) are provided at
  30  *
  31  * <pre>
  32  * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
  33  * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
  34  * </pre>
  35  *
  36  * or to compare the same entry, see
  37  *
  38  * <pre>
  39  * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
  40  * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
  41  * </pre>
  42  *
  43  * The feature table part of the file has a common definition, only the start of
  44  * each line is formatted differently in GenBank and EMBL. See
  45  * http://www.insdc.org/files/feature_table.html#7.1.
  46  */
  47 public abstract class FlatFile extends AlignFile
  48 {
  49   protected static final String LOCATION = "location"; // pseudo qualifier name passed to parseFeatureQualifier when reading a feature's location
  50
  51   protected static final String QUOTE = "\""; // a single double-quote character
  52
  53   protected static final String DOUBLED_QUOTE = QUOTE + QUOTE; // an 'escaped' double quote ("") as found inside quoted free text
  54
  55   protected static final String WHITESPACE = "\\s+"; // regular expression matching a run of one or more whitespace characters
  56
  57   /**
  58    * Removes leading or trailing double quotes (") unless doubled, and changes
  59    * any 'escaped' (doubled) double quotes to single characters. As per the
  60    * Feature Table specification for Qualifiers, Free Text.
  61    *
  62    * @param value
  63    * @return
  64    */
  65   protected static String removeQuotes(String value)
  66   {
  67     if (value == null)
  68     {
  69       return null;
  70     }
  71     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
  72     {
  73       value = value.substring(1);
  74     }
  75     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
  76     {
  77       value = value.substring(0, value.length() - 1);
  78     }
  79     value = value.replace(DOUBLED_QUOTE, QUOTE);
  80     return value;
  81   }
  82
  83   /**
  84    * Truncates (if necessary) the exon intervals to match 3 times the length of
  85    * the protein; also accepts 3 bases longer (for stop codon not included in
  86    * protein)
  87    *
  88    * @param proteinLength
  89    * @param exon
  90    *          an array of [start, end, start, end...] intervals
  91    * @return the same array (if unchanged) or a truncated copy
  92    */
  93   protected static int[] adjustForProteinLength(int proteinLength,
  94           int[] exon)
  95   {
  96     if (proteinLength <= 0 || exon == null)
  97     {
  98       return exon;
  99     }
 100     int expectedCdsLength = proteinLength * 3;
 101     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 102
 103     /*
 104      * if exon length matches protein, or is shorter, or longer by the
 105      * length of a stop codon (3 bases), then leave it unchanged
 106      */
 107     if (expectedCdsLength >= exonLength
 108             || expectedCdsLength == exonLength - 3)
 109     {
 110       return exon;
 111     }
 112
 113     int origxon[];
 114     int sxpos = -1;
 115     int endxon = 0;
 116     origxon = new int[exon.length];
 117     System.arraycopy(exon, 0, origxon, 0, exon.length);
 118     int cdspos = 0;
 119     for (int x = 0; x < exon.length; x += 2)
 120     {
 121       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 122       if (expectedCdsLength <= cdspos)
 123       {
 124         // advanced beyond last codon.
 125         sxpos = x;
 126         if (expectedCdsLength != cdspos)
 127         {
 128           // System.err
 129           // .println("Truncating final exon interval on region by "
 130           // + (cdspos - cdslength));
 131         }
 132
 133         /*
 134          * shrink the final exon - reduce end position if forward
 135          * strand, increase it if reverse
 136          */
 137         if (exon[x + 1] >= exon[x])
 138         {
 139           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 140         }
 141         else
 142         {
 143           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 144         }
 145         break;
 146       }
 147     }
 148
 149     if (sxpos != -1)
 150     {
 151       // and trim the exon interval set if necessary
 152       int[] nxon = new int[sxpos + 2];
 153       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 154       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 155                                 // set
 156       exon = nxon;
 157     }
 158     return exon;
 159   }
 160
 161   /*
 162    * values parsed from the data file (sourceDb is instead supplied to the
        * constructor)
 163    */
 164   protected String sourceDb; // source database name; used as DBRef source and as prefix of the sequence name
 165
 166   protected String accession; // used for the sequence name and its self DBRef; not assigned in this class - presumably set by the format-specific parser
 167
 168   protected String version; // version recorded on DBRefs to this entry
 169
 170   protected String description; // becomes the description of the built sequence
 171
 172   protected int length = 128; // initial capacity of the buffer used by parseSequence(); NOTE(review): presumably overwritten with the declared sequence length by subclasses - confirm
 173
 174   protected List<DBRefEntry> dbrefs; // database references added to the sequence by buildSequence()
 175
 176   protected String sequenceString; // the sequence data, set by parseSequence()
 177
 178   protected Map<String, CdsData> cds; // parsed CDS features, keyed by /protein_id
 179
 180   /**
 181    * Constructor
 182    *
 183    * @param fp
 184    * @param sourceId
 185    * @throws IOException
 186    */
 187   public FlatFile(FileParse fp, String sourceId) throws IOException
 188   {
 189     super(false, fp); // don't parse immediately
 190     this.sourceDb = sourceId;
 191     dbrefs = new ArrayList<>();
 192
 193     /*
 194      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 195      */
 196     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 197   }
 198
 199   /**
 200    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
 201    * returns the next line
 202    *
 203    * @param location
 204    * @return
 205    * @throws IOException
 206    */
 207   protected String parseCDSFeature(String location) throws IOException
 208   {
 209     String line;
 210
 211     /*
 212      * parse location, which can be over >1 line e.g. EAW51554
 213      */
 214     CdsData data = new CdsData();
 215     StringBuilder sb = new StringBuilder().append(location);
 216     line = parseFeatureQualifier(sb, LOCATION);
 217     data.cdsLocation = sb.toString();
 218
 219     while (line != null)
 220     {
 221       if (!isFeatureContinuationLine(line))
 222       {
 223         // e.g. start of next feature "FT source..."
 224         break;
 225       }
 226
 227       /*
 228        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 229        * - the value may extend over more than one line
 230        * - if the value has enclosing quotes, these are removed
 231        * - escaped double quotes ("") are reduced to a single character
 232        */
 233       int slashPos = line.indexOf('/');
 234       if (slashPos == -1)
 235       {
 236         Cache.log.error("Unexpected EMBL line ignored: " + line);
 237         line = nextLine();
 238         continue;
 239       }
 240       int eqPos = line.indexOf('=', slashPos + 1);
 241       if (eqPos == -1)
 242       {
 243         // can happen, e.g. /ribosomal_slippage
 244         line = nextLine();
 245         continue;
 246       }
 247       String qualifier = line.substring(slashPos + 1, eqPos);
 248       String value = line.substring(eqPos + 1);
 249       value = removeQuotes(value);
 250       sb = new StringBuilder().append(value);
 251       line = parseFeatureQualifier(sb, qualifier);
 252       String featureValue = sb.toString();
 253
 254       if ("protein_id".equals(qualifier))
 255       {
 256         data.proteinId = featureValue;
 257       }
 258       else if ("codon_start".equals(qualifier))
 259       {
 260         try
 261         {
 262           data.codonStart = Integer.parseInt(featureValue.trim());
 263         } catch (NumberFormatException e)
 264         {
 265           Cache.log.error("Invalid codon_start in XML for " + this.accession
 266                   + ": " + e.getMessage());
 267         }
 268       }
 269       else if ("db_xref".equals(qualifier))
 270       {
 271         String[] parts = featureValue.split(":");
 272         if (parts.length == 2)
 273         {
 274           String db = parts[0].trim();
 275           db = DBRefUtils.getCanonicalName(db);
 276           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 277           data.xrefs.add(dbref);
 278         }
 279       }
 280       else if ("product".equals(qualifier))
 281       {
 282         data.proteinName = featureValue;
 283       }
 284       else if ("translation".equals(qualifier))
 285       {
 286         data.translation = featureValue;
 287       }
 288       else if (!"".equals(featureValue))
 289       {
 290         // throw anything else into the additional properties hash
 291         data.cdsProps.put(qualifier, featureValue);
 292       }
 293     }
 294
 295     if (data.proteinId != null)
 296     {
 297       this.cds.put(data.proteinId, data);
 298     }
 299     else
 300     {
 301       Cache.log.error("Ignoring CDS feature with no protein_id for "
 302               + sourceDb + ":" + accession);
 303     }
 304
 305     return line;
 306   }
 307
 308   protected abstract boolean isFeatureContinuationLine(String line); // answers true if the line is still part of the feature being parsed; supplied by each flat file format subclass
 309
 310   /**
 311    * Output (print) is not (yet) implemented for flat file format
        *
        * @param seqs
        *          ignored
        * @param jvsuffix
        *          ignored
        * @return always null
 312    */
 313   @Override
 314   public String print(SequenceI[] seqs, boolean jvsuffix)
 315   {
 316     return null;
 317   }
 318
 319   /**
 320    * Constructs and saves the sequence from parsed components
 321    */
 322   protected void buildSequence()
 323   {
 324     if (this.accession == null || this.sequenceString == null)
 325     {
 326       Cache.log.error("Failed to parse data from EMBL");
 327       return;
 328     }
 329
 330     String name = this.accession;
 331     if (this.sourceDb != null)
 332     {
 333       name = this.sourceDb + "|" + name;
 334     }
 335     SequenceI seq = new Sequence(name, this.sequenceString);
 336     seq.setDescription(this.description);
 337
 338     /*
 339      * add a DBRef to itself
 340      */
 341     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 342     int[] startEnd = new int[] { 1, seq.getLength() };
 343     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 344     seq.addDBRef(selfRef);
 345
 346     for (DBRefEntry dbref : this.dbrefs)
 347     {
 348       seq.addDBRef(dbref);
 349     }
 350
 351     processCDSFeatures(seq);
 352
 353     seq.deriveSequence();
 354
 355     addSequence(seq);
 356   }
 357
 358   /**
 359    * Process the CDS features, including generation of cross-references and
 360    * mappings to the protein products (translation)
 361    *
 362    * @param seq
 363    */
 364   protected void processCDSFeatures(SequenceI seq)
 365   {
 366     /*
 367      * record protein products found to avoid duplication i.e. >1 CDS with
 368      * the same /protein_id [though not sure I can find an example of this]
 369      */
 370     Map<String, SequenceI> proteins = new HashMap<>();
 371     for (CdsData data : cds.values())
 372     {
 373       processCDSFeature(seq, data, proteins);
 374     }
 375   }
 376
 377   /**
 378    * Processes data for one parsed CDS feature to
 379    * <ul>
 380    * <li>create a protein product sequence for the translation</li>
 381    * <li>create a cross-reference to protein with mapping from dna</li>
 382    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 383    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 384    * </ul>
 385    *
 386    * @param SequenceI
 387    *          dna
 388    * @param proteins
 389    *          map of protein products so far derived from CDS data
 390    */
 391   void processCDSFeature(SequenceI dna, CdsData data,
 392           Map<String, SequenceI> proteins)
 393   {
 394     /*
 395      * parse location into a list of [start, end, start, end] positions
 396      */
 397     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 398
 399     MapList maplist = buildMappingToProtein(dna, exons, data);
 400
 401     int exonNumber = 0;
 402
 403     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 404     {
 405       int exonStart = exons[xint];
 406       int exonEnd = exons[xint + 1];
 407       int begin = Math.min(exonStart, exonEnd);
 408       int end = Math.max(exonStart, exonEnd);
 409       exonNumber++;
 410       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 411               exonNumber, data.proteinId);
 412
 413       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 414               this.sourceDb);
 415       for (Entry<String, String> val : data.cdsProps.entrySet())
 416       {
 417         sf.setValue(val.getKey(), val.getValue());
 418       }
 419
 420       sf.setEnaLocation(data.cdsLocation);
 421       boolean forwardStrand = exonStart <= exonEnd;
 422       sf.setStrand(forwardStrand ? "+" : "-");
 423       sf.setPhase(String.valueOf(data.codonStart - 1));
 424       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 425       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 426
 427       dna.addSequenceFeature(sf);
 428     }
 429
 430     boolean hasUniprotDbref = false;
 431     for (DBRefEntry xref : data.xrefs)
 432     {
 433       dna.addDBRef(xref);
 434       if (xref.getSource().equals(DBRefSource.UNIPROT))
 435       {
 436         /*
 437          * construct (or find) the sequence for (data.protein_id, data.translation)
 438          */
 439         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 440         Mapping map = new Mapping(protein, maplist);
 441         map.setMappedFromId(data.proteinId);
 442         xref.setMap(map);
 443
 444         /*
 445          * add DBRefs with mappings from dna to protein and the inverse
 446          */
 447         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 448         db1.setMap(new Mapping(dna, maplist.getInverse()));
 449         protein.addDBRef(db1);
 450
 451         hasUniprotDbref = true;
 452       }
 453     }
 454
 455     /*
 456      * if we have a product (translation) but no explicit Uniprot dbref
 457      * (example: EMBL M19487 protein_id AAB02592.1)
 458      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 459      */
 460     if (!hasUniprotDbref)
 461     {
 462       SequenceI protein = proteins.get(data.proteinId);
 463       if (protein == null)
 464       {
 465         protein = new Sequence(data.proteinId, data.translation);
 466         protein.setDescription(data.proteinName);
 467         proteins.put(data.proteinId, protein);
 468       }
 469       // assuming CDSPROTEIN sequence version = dna version (?!)
 470       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 471               this.version, data.proteinId);
 472       protein.addDBRef(db1);
 473
 474       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 475               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 476       Mapping map = new Mapping(protein, maplist);
 477       map.setMappedFromId(data.proteinId);
 478       dnaToEmblProteinRef.setMap(map);
 479       dna.addDBRef(dnaToEmblProteinRef);
 480     }
 481
 482     /*
 483      * comment brought forward from EmblXmlSource, lines 447-451:
 484      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 485      * sequence with the exon  map; if given a dataset reference, search
 486      * dataset for parent EMBL sequence if it exists and set its map;
 487      * make a new feature annotating the coding contig
 488      */
 489   }
 490
 491   /**
 492    * Computes a mapping from CDS positions in DNA sequence to protein product
 493    * positions, with allowance for stop codon or incomplete start codon
 494    *
 495    * @param dna
 496    * @param exons
 497    * @param data
 498    * @return
 499    */
 500   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 501           final CdsData data)
 502   {
 503     MapList dnaToProteinMapping = null;
 504     int peptideLength = data.translation.length();
 505
 506     int[] proteinRange = new int[] { 1, peptideLength };
 507     if (exons != null && exons.length > 0)
 508     {
 509       /*
 510        * We were able to parse 'location'; do a final
 511        * product length truncation check
 512        */
 513       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 514       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 515     }
 516     else
 517     {
 518       /*
 519        * workaround until we handle all 'location' formats fully
 520        * e.g. X53828.1:60..1058 or <123..>289
 521        */
 522       Cache.log.error(String.format(
 523               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 524                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 525               data.cdsLocation, sourceDb, this.accession));
 526
 527       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 528       int mappedDnaEnd = dna.getEnd();
 529       if (peptideLength * 3 == completeCodonsLength)
 530       {
 531         // this might occur for CDS sequences where no features are marked
 532         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 533         mappedDnaEnd = dna.getEnd();
 534       }
 535       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 536       {
 537         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 538         mappedDnaEnd = dna.getEnd() - 3;
 539       }
 540
 541       if (mappedDnaEnd != -1)
 542       {
 543         int[] cdsRanges = new int[] {
 544             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 545         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 546       }
 547     }
 548
 549     return dnaToProteinMapping;
 550   }
 551
 552   /**
 553    * Constructs a sequence for the protein product for the CDS data (if there is
 554    * one), and dbrefs with mappings from CDS to protein and the reverse
 555    *
 556    * @param dna
 557    * @param xref
 558    * @param data
 559    * @param proteins
 560    * @return
 561    */
 562   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 563           CdsData data, Map<String, SequenceI> proteins)
 564   {
 565     /*
 566      * check we have some data to work with
 567      */
 568     if (data.proteinId == null || data.translation == null)
 569     {
 570       return null;
 571     }
 572
 573     /*
 574      * Construct the protein sequence (if not already seen)
 575      */
 576     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 577     SequenceI protein = proteins.get(proteinSeqName);
 578     if (protein == null)
 579     {
 580       protein = new Sequence(proteinSeqName, data.translation, 1,
 581               data.translation.length());
 582       protein.setDescription(data.proteinName != null ? data.proteinName
 583               : "Protein Product from " + sourceDb);
 584       proteins.put(proteinSeqName, protein);
 585     }
 586
 587     return protein;
 588   }
 589
 590   /**
 591    * Returns the CDS location as a single array of [start, end, start, end...]
 592    * positions. If on the reverse strand, these will be in descending order.
 593    *
 594    * @param accession
 595    * @param location
 596    * @return
 597    */
 598   protected int[] getCdsRanges(String accession, String location)
 599   {
 600     if (location == null)
 601     {
 602       return new int[] {};
 603     }
 604
 605     try
 606     {
 607       List<int[]> ranges = DnaUtils.parseLocation(location);
 608       return MappingUtils.rangeListToArray(ranges);
 609     } catch (ParseException e)
 610     {
 611       Cache.log.warn(
 612               String.format("Not parsing inexact CDS location %s in ENA %s",
 613                       location, accession));
 614       return new int[] {};
 615     }
 616   }
 617
 618   /**
 619    * Reads the value of a feature (FT) qualifier from one or more lines of the
 620    * file, and returns the next line after that. Values are appended to the
 621    * string buffer, which should be already primed with the value read from the
 622    * first line for the qualifier (with any leading double quote removed).
 623    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 624    * reduced to one only. For example for
 625    *
 626    * <pre>
 627    * FT      /note="gene_id=hCG28070.3
 628    * FT      ""foobar"" isoform=CRA_b"
 629    * the returned value is
 630    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 631    * </pre>
 632    *
 633    * Note the side-effect of this method, to advance data reading to the next
 634    * line after the feature qualifier (which could be another qualifier, a
 635    * different feature, a non-feature line, or null at end of file).
 636    *
 637    * @param sb
 638    *          a string buffer primed with the first line of the value
 639    * @param qualifierName
 640    * @return
 641    * @throws IOException
 642    */
 643   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 644           throws IOException
 645   {
 646     String line;
 647     while ((line = nextLine()) != null)
 648     {
 649       if (!isFeatureContinuationLine(line))
 650       {
 651         break; // reached next feature or other input line
 652       }
 653       String[] tokens = line.split(WHITESPACE);
 654       if (tokens.length < 2)
 655       {
 656         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 657                 + ": " + line);
 658         break;
 659       }
 660       if (tokens[1].startsWith("/"))
 661       {
 662         break; // next feature qualifier
 663       }
 664
 665       /*
 666        * heuristic rule: most multi-line value (e.g. /product) are text,
 667        * so add a space for word boundary at a new line; not for translation
 668        */
 669       if (!"translation".equals(qualifierName)
 670               && !LOCATION.equals(qualifierName))
 671       {
 672         sb.append(" ");
 673       }
 674
 675       /*
 676        * remove trailing " and unescape doubled ""
 677        */
 678       String data = removeQuotes(tokens[1]);
 679       sb.append(data);
 680     }
 681
 682     return line;
 683   }
 684
 685   /**
 686    * Reads and saves the sequence, read from the lines following the ORIGIN
 687    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
 688    * discarded. Returns the next line following the sequence data (the next line
 689    * that doesn't start with whitespace).
 690    *
 691    * @throws IOException
 692    */
 693   protected String parseSequence() throws IOException
 694   {
 695     StringBuilder sb = new StringBuilder(this.length);
 696     String line = nextLine();
 697     while (line != null && line.startsWith(" "))
 698     {
 699       line = line.trim();
 700       String[] blocks = line.split(WHITESPACE);
 701
 702       /*
 703        * the first or last block on each line might be a position count - omit
 704        */
 705       for (int i = 0; i < blocks.length; i++)
 706       {
 707         try
 708         {
 709           Long.parseLong(blocks[i]);
 710           // position counter - ignore it
 711         } catch (NumberFormatException e)
 712         {
 713           // sequence data - append it
 714           sb.append(blocks[i]);
 715         }
 716       }
 717       line = nextLine();
 718     }
 719     this.sequenceString = sb.toString();
 720
 721     return line;
 722   }
 723
 724   /**
 725    * Processes a feature line. If it declares a feature type of interest
 726    * (currently, only CDS is processed), processes all of the associated lines
 727    * (feature qualifiers), and returns the next line after that, otherwise
 728    * simply returns the next line.
 729    *
 730    * @param line
 731    *          the first line for the feature (with initial FT omitted for EMBL
 732    *          format)
 733    * @return
 734    * @throws IOException
 735    */
 736   protected String parseFeature(String line) throws IOException
 737   {
 738     String[] tokens = line.trim().split(WHITESPACE);
 739     if (tokens.length < 2 || !"CDS".equals(tokens[0]))
 740     {
 741       return nextLine();
 742     }
 743
 744     return parseCDSFeature(tokens[1]);
 745   }
 746 }
 747
 748 /**
 749  * A data bean class to hold values parsed from one CDS Feature. Fields are
      * package-visible and are filled in directly by FlatFile.parseCDSFeature.
 750  */
 751 class CdsData
 752 {
 753   String translation; // from /translation qualifier; null if the CDS has none
 754
 755   String cdsLocation; // the raw value e.g. join(1..1234,2012..2837)
 756
 757   int codonStart = 1; // from /codon_start qualifier; stays 1 if absent or not a number
 758
 759   String proteinName; // from /product qualifier; used for protein description
 760
 761   String proteinId; // from /protein_id qualifier; the key under which this CDS is saved
 762
 763   List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers
 764
 765   Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers with a non-empty value, by qualifier name
 766 }