src/jalview/io/FlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12 import java.util.TreeMap;
  13
  14 import jalview.bin.Cache;
  15 import jalview.datamodel.DBRefEntry;
  16 import jalview.datamodel.DBRefSource;
  17 import jalview.datamodel.FeatureProperties;
  18 import jalview.datamodel.Mapping;
  19 import jalview.datamodel.Sequence;
  20 import jalview.datamodel.SequenceFeature;
  21 import jalview.datamodel.SequenceI;
  22 import jalview.util.DBRefUtils;
  23 import jalview.util.DnaUtils;
  24 import jalview.util.MapList;
  25 import jalview.util.MappingUtils;
  26
  27 /**
  28  * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
  29  * data. Example files (rather than formal specifications) are provided at
  30  *
  31  * <pre>
  32  * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
  33  * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
  34  * </pre>
  35  *
  36  * or to compare the same entry, see
  37  *
  38  * <pre>
  39  * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
  40  * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
  41  * </pre>
  42  *
  43  * The feature table part of the file has a common definition, only the start of
  44  * each line is formatted differently in GenBank and EMBL. See
  45  * http://www.insdc.org/files/feature_table.html#7.1.
  46  */
  47 public abstract class FlatFile extends AlignFile
  48 {
  /**
   * Name passed to parseFeatureQualifier when reading a feature's location;
   * values read under this name are joined without a space between lines
   */
  protected static final String LOCATION = "location";

  /** The double quote character, which encloses free text qualifier values */
  protected static final String QUOTE = "\"";

  /** Two consecutive double quotes, i.e. an 'escaped' quote within a value */
  protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;

  /** Regular expression matching a run of one or more whitespace characters */
  protected static final String WHITESPACE = "\\s+";
  56
  57   /**
  58    * Removes leading or trailing double quotes (") unless doubled, and changes
  59    * any 'escaped' (doubled) double quotes to single characters. As per the
  60    * Feature Table specification for Qualifiers, Free Text.
  61    *
  62    * @param value
  63    * @return
  64    */
  65   protected static String removeQuotes(String value)
  66   {
  67     if (value == null)
  68     {
  69       return null;
  70     }
  71     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
  72     {
  73       value = value.substring(1);
  74     }
  75     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
  76     {
  77       value = value.substring(0, value.length() - 1);
  78     }
  79     value = value.replace(DOUBLED_QUOTE, QUOTE);
  80     return value;
  81   }
  82
  83   /**
  84    * Truncates (if necessary) the exon intervals to match 3 times the length of
  85    * the protein; also accepts 3 bases longer (for stop codon not included in
  86    * protein)
  87    *
  88    * @param proteinLength
  89    * @param exon
  90    *          an array of [start, end, start, end...] intervals
  91    * @return the same array (if unchanged) or a truncated copy
  92    */
  93   protected static int[] adjustForProteinLength(int proteinLength,
  94           int[] exon)
  95   {
  96     if (proteinLength <= 0 || exon == null)
  97     {
  98       return exon;
  99     }
 100     int expectedCdsLength = proteinLength * 3;
 101     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 102
 103     /*
 104      * if exon length matches protein, or is shorter, or longer by the
 105      * length of a stop codon (3 bases), then leave it unchanged
 106      */
 107     if (expectedCdsLength >= exonLength
 108             || expectedCdsLength == exonLength - 3)
 109     {
 110       return exon;
 111     }
 112
 113     int origxon[];
 114     int sxpos = -1;
 115     int endxon = 0;
 116     origxon = new int[exon.length];
 117     System.arraycopy(exon, 0, origxon, 0, exon.length);
 118     int cdspos = 0;
 119     for (int x = 0; x < exon.length; x += 2)
 120     {
 121       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 122       if (expectedCdsLength <= cdspos)
 123       {
 124         // advanced beyond last codon.
 125         sxpos = x;
 126         if (expectedCdsLength != cdspos)
 127         {
 128           // System.err
 129           // .println("Truncating final exon interval on region by "
 130           // + (cdspos - cdslength));
 131         }
 132
 133         /*
 134          * shrink the final exon - reduce end position if forward
 135          * strand, increase it if reverse
 136          */
 137         if (exon[x + 1] >= exon[x])
 138         {
 139           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 140         }
 141         else
 142         {
 143           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 144         }
 145         break;
 146       }
 147     }
 148
 149     if (sxpos != -1)
 150     {
 151       // and trim the exon interval set if necessary
 152       int[] nxon = new int[sxpos + 2];
 153       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 154       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 155                                 // set
 156       exon = nxon;
 157     }
 158     return exon;
 159   }
 160
  /*
   * values parsed from the data file
   */

  /** name of the source database, as supplied to the constructor */
  protected String sourceDb;

  /** accession of the entry; used (with sourceDb) to name the sequence */
  protected String accession;

  /** version of the entry; used in the DBRefs created for the sequence */
  protected String version;

  /** text set as the description of the constructed sequence */
  protected String description;

  /*
   * initial capacity of the buffer used by parseSequence();
   * NOTE(review): presumably updated by subclasses from the sequence length
   * declared in the file - confirm
   */
  protected int length = 128;

  /** database cross-references to be added to the constructed sequence */
  protected List<DBRefEntry> dbrefs;

  /** the sequence data, as saved by parseSequence() */
  protected String sequenceString;

  /** data parsed for each CDS feature, keyed by its /protein_id */
  protected Map<String, CdsData> cds;
 179
 180   /**
 181    * Constructor
 182    *
 183    * @param fp
 184    * @param sourceId
 185    * @throws IOException
 186    */
 187   public FlatFile(FileParse fp, String sourceId) throws IOException
 188   {
 189     super(false, fp); // don't parse immediately
 190     this.sourceDb = sourceId;
 191     dbrefs = new ArrayList<>();
 192
 193     /*
 194      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 195      */
 196     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 197
 198     parse();
 199   }
 200
 201   /**
 202    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
 203    * returns the next line
 204    *
 205    * @param location
 206    * @return
 207    * @throws IOException
 208    */
 209   protected String parseCDSFeature(String location) throws IOException
 210   {
 211     String line;
 212
 213     /*
 214      * parse location, which can be over >1 line e.g. EAW51554
 215      */
 216     CdsData data = new CdsData();
 217     StringBuilder sb = new StringBuilder().append(location);
 218     line = parseFeatureQualifier(sb, LOCATION);
 219     data.cdsLocation = sb.toString();
 220
 221     while (line != null)
 222     {
 223       if (!isFeatureContinuationLine(line))
 224       {
 225         // e.g. start of next feature "FT source..."
 226         break;
 227       }
 228
 229       /*
 230        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 231        * - the value may extend over more than one line
 232        * - if the value has enclosing quotes, these are removed
 233        * - escaped double quotes ("") are reduced to a single character
 234        */
 235       int slashPos = line.indexOf('/');
 236       if (slashPos == -1)
 237       {
 238         Cache.log.error("Unexpected EMBL line ignored: " + line);
 239         line = nextLine();
 240         continue;
 241       }
 242       int eqPos = line.indexOf('=', slashPos + 1);
 243       if (eqPos == -1)
 244       {
 245         // can happen, e.g. /ribosomal_slippage
 246         line = nextLine();
 247         continue;
 248       }
 249       String qualifier = line.substring(slashPos + 1, eqPos);
 250       String value = line.substring(eqPos + 1);
 251       value = removeQuotes(value);
 252       sb = new StringBuilder().append(value);
 253       line = parseFeatureQualifier(sb, qualifier);
 254       String featureValue = sb.toString();
 255
 256       if ("protein_id".equals(qualifier))
 257       {
 258         data.proteinId = featureValue;
 259       }
 260       else if ("codon_start".equals(qualifier))
 261       {
 262         try
 263         {
 264           data.codonStart = Integer.parseInt(featureValue.trim());
 265         } catch (NumberFormatException e)
 266         {
 267           Cache.log.error("Invalid codon_start in XML for " + this.accession
 268                   + ": " + e.getMessage());
 269         }
 270       }
 271       else if ("db_xref".equals(qualifier))
 272       {
 273         String[] parts = featureValue.split(":");
 274         if (parts.length == 2)
 275         {
 276           String db = parts[0].trim();
 277           db = DBRefUtils.getCanonicalName(db);
 278           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 279           data.xrefs.add(dbref);
 280         }
 281       }
 282       else if ("product".equals(qualifier))
 283       {
 284         data.proteinName = featureValue;
 285       }
 286       else if ("translation".equals(qualifier))
 287       {
 288         data.translation = featureValue;
 289       }
 290       else if (!"".equals(featureValue))
 291       {
 292         // throw anything else into the additional properties hash
 293         data.cdsProps.put(qualifier, featureValue);
 294       }
 295     }
 296
 297     if (data.proteinId != null)
 298     {
 299       this.cds.put(data.proteinId, data);
 300     }
 301     else
 302     {
 303       Cache.log.error("Ignoring CDS feature with no protein_id for "
 304               + sourceDb + ":" + accession);
 305     }
 306
 307     return line;
 308   }
 309
  /**
   * Answers true if the line is part of the feature currently being read,
   * false if not. Used by parseCDSFeature and parseFeatureQualifier, which
   * stop reading a feature at the first line for which this answers false.
   * NOTE(review): presumably implemented in subclasses by testing the start
   * of the line, which differs between EMBL and GenBank formats - confirm
   *
   * @param line
   *          a (non-null) line read from the data
   * @return
   */
  protected abstract boolean isFeatureContinuationLine(String line);
 311
  /**
   * Output (print) is not (yet) implemented for flat file format, so this
   * method ignores its arguments and always returns null
   *
   * @param seqs
   *          not used
   * @param jvsuffix
   *          not used
   * @return null
   */
  @Override
  public String print(SequenceI[] seqs, boolean jvsuffix)
  {
    return null;
  }
 320
 321   /**
 322    * Constructs and saves the sequence from parsed components
 323    */
 324   protected void buildSequence()
 325   {
 326     if (this.accession == null || this.sequenceString == null)
 327     {
 328       Cache.log.error("Failed to parse data from EMBL");
 329       return;
 330     }
 331
 332     String name = this.accession;
 333     if (this.sourceDb != null)
 334     {
 335       name = this.sourceDb + "|" + name;
 336     }
 337     SequenceI seq = new Sequence(name, this.sequenceString);
 338     seq.setDescription(this.description);
 339
 340     /*
 341      * add a DBRef to itself
 342      */
 343     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 344     int[] startEnd = new int[] { 1, seq.getLength() };
 345     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 346     seq.addDBRef(selfRef);
 347
 348     for (DBRefEntry dbref : this.dbrefs)
 349     {
 350       seq.addDBRef(dbref);
 351     }
 352
 353     processCDSFeatures(seq);
 354
 355     seq.deriveSequence();
 356
 357     addSequence(seq);
 358   }
 359
 360   /**
 361    * Process the CDS features, including generation of cross-references and
 362    * mappings to the protein products (translation)
 363    *
 364    * @param seq
 365    */
 366   protected void processCDSFeatures(SequenceI seq)
 367   {
 368     /*
 369      * record protein products found to avoid duplication i.e. >1 CDS with
 370      * the same /protein_id [though not sure I can find an example of this]
 371      */
 372     Map<String, SequenceI> proteins = new HashMap<>();
 373     for (CdsData data : cds.values())
 374     {
 375       processCDSFeature(seq, data, proteins);
 376     }
 377   }
 378
 379   /**
 380    * Processes data for one parsed CDS feature to
 381    * <ul>
 382    * <li>create a protein product sequence for the translation</li>
 383    * <li>create a cross-reference to protein with mapping from dna</li>
 384    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 385    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 386    * </ul>
 387    *
 388    * @param SequenceI
 389    *          dna
 390    * @param proteins
 391    *          map of protein products so far derived from CDS data
 392    */
 393   void processCDSFeature(SequenceI dna, CdsData data,
 394           Map<String, SequenceI> proteins)
 395   {
 396     /*
 397      * parse location into a list of [start, end, start, end] positions
 398      */
 399     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 400
 401     MapList maplist = buildMappingToProtein(dna, exons, data);
 402
 403     int exonNumber = 0;
 404
 405     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 406     {
 407       int exonStart = exons[xint];
 408       int exonEnd = exons[xint + 1];
 409       int begin = Math.min(exonStart, exonEnd);
 410       int end = Math.max(exonStart, exonEnd);
 411       exonNumber++;
 412       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 413               exonNumber, data.proteinId);
 414
 415       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 416               this.sourceDb);
 417       for (Entry<String, String> val : data.cdsProps.entrySet())
 418       {
 419         sf.setValue(val.getKey(), val.getValue());
 420       }
 421
 422       sf.setEnaLocation(data.cdsLocation);
 423       boolean forwardStrand = exonStart <= exonEnd;
 424       sf.setStrand(forwardStrand ? "+" : "-");
 425       sf.setPhase(String.valueOf(data.codonStart - 1));
 426       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 427       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 428
 429       dna.addSequenceFeature(sf);
 430     }
 431
 432     boolean hasUniprotDbref = false;
 433     for (DBRefEntry xref : data.xrefs)
 434     {
 435       dna.addDBRef(xref);
 436       if (xref.getSource().equals(DBRefSource.UNIPROT))
 437       {
 438         /*
 439          * construct (or find) the sequence for (data.protein_id, data.translation)
 440          */
 441         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 442         Mapping map = new Mapping(protein, maplist);
 443         map.setMappedFromId(data.proteinId);
 444         xref.setMap(map);
 445
 446         /*
 447          * add DBRefs with mappings from dna to protein and the inverse
 448          */
 449         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 450         db1.setMap(new Mapping(dna, maplist.getInverse()));
 451         protein.addDBRef(db1);
 452
 453         hasUniprotDbref = true;
 454       }
 455     }
 456
 457     /*
 458      * if we have a product (translation) but no explicit Uniprot dbref
 459      * (example: EMBL M19487 protein_id AAB02592.1)
 460      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 461      */
 462     if (!hasUniprotDbref)
 463     {
 464       SequenceI protein = proteins.get(data.proteinId);
 465       if (protein == null)
 466       {
 467         protein = new Sequence(data.proteinId, data.translation);
 468         protein.setDescription(data.proteinName);
 469         proteins.put(data.proteinId, protein);
 470       }
 471       // assuming CDSPROTEIN sequence version = dna version (?!)
 472       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 473               this.version, data.proteinId);
 474       protein.addDBRef(db1);
 475
 476       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 477               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 478       Mapping map = new Mapping(protein, maplist);
 479       map.setMappedFromId(data.proteinId);
 480       dnaToEmblProteinRef.setMap(map);
 481       dna.addDBRef(dnaToEmblProteinRef);
 482     }
 483
 484     /*
 485      * comment brought forward from EmblXmlSource, lines 447-451:
 486      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 487      * sequence with the exon  map; if given a dataset reference, search
 488      * dataset for parent EMBL sequence if it exists and set its map;
 489      * make a new feature annotating the coding contig
 490      */
 491   }
 492
 493   /**
 494    * Computes a mapping from CDS positions in DNA sequence to protein product
 495    * positions, with allowance for stop codon or incomplete start codon
 496    *
 497    * @param dna
 498    * @param exons
 499    * @param data
 500    * @return
 501    */
 502   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 503           final CdsData data)
 504   {
 505     MapList dnaToProteinMapping = null;
 506     int peptideLength = data.translation.length();
 507
 508     int[] proteinRange = new int[] { 1, peptideLength };
 509     if (exons != null && exons.length > 0)
 510     {
 511       /*
 512        * We were able to parse 'location'; do a final
 513        * product length truncation check
 514        */
 515       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 516       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 517     }
 518     else
 519     {
 520       /*
 521        * workaround until we handle all 'location' formats fully
 522        * e.g. X53828.1:60..1058 or <123..>289
 523        */
 524       Cache.log.error(String.format(
 525               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 526                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 527               data.cdsLocation, sourceDb, this.accession));
 528
 529       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 530       int mappedDnaEnd = dna.getEnd();
 531       if (peptideLength * 3 == completeCodonsLength)
 532       {
 533         // this might occur for CDS sequences where no features are marked
 534         Cache.log.warn("Assuming no stop codon at end of cDNA fragment");
 535         mappedDnaEnd = dna.getEnd();
 536       }
 537       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 538       {
 539         Cache.log.warn("Assuming stop codon at end of cDNA fragment");
 540         mappedDnaEnd = dna.getEnd() - 3;
 541       }
 542
 543       if (mappedDnaEnd != -1)
 544       {
 545         int[] cdsRanges = new int[] {
 546             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 547         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 548       }
 549     }
 550
 551     return dnaToProteinMapping;
 552   }
 553
 554   /**
 555    * Constructs a sequence for the protein product for the CDS data (if there is
 556    * one), and dbrefs with mappings from CDS to protein and the reverse
 557    *
 558    * @param dna
 559    * @param xref
 560    * @param data
 561    * @param proteins
 562    * @return
 563    */
 564   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 565           CdsData data, Map<String, SequenceI> proteins)
 566   {
 567     /*
 568      * check we have some data to work with
 569      */
 570     if (data.proteinId == null || data.translation == null)
 571     {
 572       return null;
 573     }
 574
 575     /*
 576      * Construct the protein sequence (if not already seen)
 577      */
 578     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 579     SequenceI protein = proteins.get(proteinSeqName);
 580     if (protein == null)
 581     {
 582       protein = new Sequence(proteinSeqName, data.translation, 1,
 583               data.translation.length());
 584       protein.setDescription(data.proteinName != null ? data.proteinName
 585               : "Protein Product from " + sourceDb);
 586       proteins.put(proteinSeqName, protein);
 587     }
 588
 589     return protein;
 590   }
 591
 592   /**
 593    * Returns the CDS location as a single array of [start, end, start, end...]
 594    * positions. If on the reverse strand, these will be in descending order.
 595    *
 596    * @param accession
 597    * @param location
 598    * @return
 599    */
 600   protected int[] getCdsRanges(String accession, String location)
 601   {
 602     if (location == null)
 603     {
 604       return new int[] {};
 605     }
 606
 607     try
 608     {
 609       List<int[]> ranges = DnaUtils.parseLocation(location);
 610       return MappingUtils.rangeListToArray(ranges);
 611     } catch (ParseException e)
 612     {
 613       Cache.log.warn(
 614               String.format("Not parsing inexact CDS location %s in ENA %s",
 615                       location, accession));
 616       return new int[] {};
 617     }
 618   }
 619
 620   /**
 621    * Reads the value of a feature (FT) qualifier from one or more lines of the
 622    * file, and returns the next line after that. Values are appended to the
 623    * string buffer, which should be already primed with the value read from the
 624    * first line for the qualifier (with any leading double quote removed).
 625    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 626    * reduced to one only. For example for
 627    *
 628    * <pre>
 629    * FT      /note="gene_id=hCG28070.3
 630    * FT      ""foobar"" isoform=CRA_b"
 631    * the returned value is
 632    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 633    * </pre>
 634    *
 635    * Note the side-effect of this method, to advance data reading to the next
 636    * line after the feature qualifier (which could be another qualifier, a
 637    * different feature, a non-feature line, or null at end of file).
 638    *
 639    * @param sb
 640    *          a string buffer primed with the first line of the value
 641    * @param qualifierName
 642    * @return
 643    * @throws IOException
 644    */
 645   String parseFeatureQualifier(StringBuilder sb, String qualifierName)
 646           throws IOException
 647   {
 648     String line;
 649     while ((line = nextLine()) != null)
 650     {
 651       if (!isFeatureContinuationLine(line))
 652       {
 653         break; // reached next feature or other input line
 654       }
 655       String[] tokens = line.split(WHITESPACE);
 656       if (tokens.length < 2)
 657       {
 658         Cache.log.error("Ignoring bad EMBL line for " + this.accession
 659                 + ": " + line);
 660         break;
 661       }
 662       if (tokens[1].startsWith("/"))
 663       {
 664         break; // next feature qualifier
 665       }
 666
 667       /*
 668        * heuristic rule: most multi-line value (e.g. /product) are text,
 669        * so add a space for word boundary at a new line; not for translation
 670        */
 671       if (!"translation".equals(qualifierName)
 672               && !LOCATION.equals(qualifierName))
 673       {
 674         sb.append(" ");
 675       }
 676
 677       /*
 678        * remove trailing " and unescape doubled ""
 679        */
 680       String data = removeQuotes(tokens[1]);
 681       sb.append(data);
 682     }
 683
 684     return line;
 685   }
 686
 687   /**
 688    * Reads and saves the sequence, read from the lines following the ORIGIN
 689    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
 690    * discarded. Returns the next line following the sequence data (the next line
 691    * that doesn't start with whitespace).
 692    *
 693    * @throws IOException
 694    */
 695   protected String parseSequence() throws IOException
 696   {
 697     StringBuilder sb = new StringBuilder(this.length);
 698     String line = nextLine();
 699     while (line != null && line.startsWith(" "))
 700     {
 701       line = line.trim();
 702       String[] blocks = line.split(WHITESPACE);
 703
 704       /*
 705        * the first or last block on each line might be a position count - omit
 706        */
 707       for (int i = 0; i < blocks.length; i++)
 708       {
 709         try
 710         {
 711           Long.parseLong(blocks[i]);
 712           // position counter - ignore it
 713         } catch (NumberFormatException e)
 714         {
 715           // sequence data - append it
 716           sb.append(blocks[i]);
 717         }
 718       }
 719       line = nextLine();
 720     }
 721     this.sequenceString = sb.toString();
 722
 723     return line;
 724   }
 725
 726   /**
 727    * Processes a feature line. If it declares a feature type of interest
 728    * (currently, only CDS is processed), processes all of the associated lines
 729    * (feature qualifiers), and returns the next line after that, otherwise
 730    * simply returns the next line.
 731    *
 732    * @param line
 733    *          the first line for the feature (with initial FT omitted for EMBL
 734    *          format)
 735    * @return
 736    * @throws IOException
 737    */
 738   protected String parseFeature(String line) throws IOException
 739   {
 740     String[] tokens = line.trim().split(WHITESPACE);
 741     if (tokens.length < 2 || !"CDS".equals(tokens[0]))
 742     {
 743       return nextLine();
 744     }
 745
 746     return parseCDSFeature(tokens[1]);
 747   }
 748 }
 749
/**
 * A data bean class to hold values parsed from one CDS Feature. Fields are
 * package-private and are set directly while the feature is parsed.
 */
class CdsData
{
  /** the protein sequence, from /translation qualifier; null if there is none */
  String translation;

  /** the raw value of the feature location e.g. join(1..1234,2012..2837) */
  String cdsLocation;

  /** from /codon_start qualifier; 1 if there is none (or it is not a number) */
  int codonStart = 1;

  /** from /product qualifier; used for protein description */
  String proteinName;

  /** from /protein_id qualifier; null if there is none */
  String proteinId;

  /** one database reference for each /db_xref qualifier parsed */
  List<DBRefEntry> xrefs = new ArrayList<>();

  /** names and (non-empty) values of any other qualifiers */
  Map<String, String> cdsProps = new Hashtable<>();
}