src/jalview/io/EMBLLikeFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Arrays;
   7 import java.util.HashMap;
   8 import java.util.Hashtable;
   9 import java.util.List;
  10 import java.util.Locale;
  11 import java.util.Map;
  12 import java.util.Map.Entry;
  13 import java.util.TreeMap;
  14
  15 import jalview.bin.Console;
  16 import jalview.datamodel.DBRefEntry;
  17 import jalview.datamodel.DBRefSource;
  18 import jalview.datamodel.FeatureProperties;
  19 import jalview.datamodel.Mapping;
  20 import jalview.datamodel.Sequence;
  21 import jalview.datamodel.SequenceFeature;
  22 import jalview.datamodel.SequenceI;
  23 import jalview.util.DBRefUtils;
  24 import jalview.util.DnaUtils;
  25 import jalview.util.MapList;
  26 import jalview.util.MappingUtils;
  27
  28 /**
  29  * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
  30  * data. Example files (rather than formal specifications) are provided at
  31  *
  32  * <pre>
  33  * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
  34  * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
  35  * </pre>
  36  *
  37  * or to compare the same entry, see
  38  *
  39  * <pre>
  40  * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
  41  * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
  42  * </pre>
  43  *
  44  * The feature table part of the file has a common definition, only the start of
  45  * each line is formatted differently in GenBank and EMBL. See
  46  * http://www.insdc.org/files/feature_table.html#7.1.
  47  */
  48 public abstract class EMBLLikeFlatFile extends AlignFile
  49 {
  /*
   * NOTE(review): not referenced anywhere in this class; presumably a key
   * used by subclasses - confirm
   */
  protected static final String LOCATION = "location";

  /* the double quote character that may enclose a qualifier value */
  protected static final String QUOTE = "\"";

  /* two double quotes, i.e. an 'escaped' double quote inside a value */
  protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;

  /* regular expression used to split a line into whitespace-separated tokens */
  protected static final String WHITESPACE = "\\s+";
  57
  58   /**
  59    * Removes leading or trailing double quotes (") unless doubled, and changes
  60    * any 'escaped' (doubled) double quotes to single characters. As per the
  61    * Feature Table specification for Qualifiers, Free Text.
  62    *
  63    * @param value
  64    * @return
  65    */
  66   protected static String removeQuotes(String value)
  67   {
  68     if (value == null)
  69     {
  70       return null;
  71     }
  72     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
  73     {
  74       value = value.substring(1);
  75     }
  76     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
  77     {
  78       value = value.substring(0, value.length() - 1);
  79     }
  80     value = value.replace(DOUBLED_QUOTE, QUOTE);
  81     return value;
  82   }
  83
  84   /**
  85    * Truncates (if necessary) the exon intervals to match 3 times the length of
  86    * the protein(including truncation for stop codon included in exon)
  87    *
  88    * @param proteinLength
  89    * @param exon
  90    *          an array of [start, end, start, end...] intervals
  91    * @return the same array (if unchanged) or a truncated copy
  92    */
  93   protected static int[] adjustForProteinLength(int proteinLength,
  94           int[] exon)
  95   {
  96     if (proteinLength <= 0 || exon == null)
  97     {
  98       return exon;
  99     }
 100     int expectedCdsLength = proteinLength * 3;
 101     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 102
 103     /*
 104      * if exon length matches protein, or is shorter, then leave it unchanged
 105      */
 106     if (expectedCdsLength >= exonLength)
 107     {
 108       return exon;
 109     }
 110
 111     int origxon[];
 112     int sxpos = -1;
 113     int endxon = 0;
 114     origxon = new int[exon.length];
 115     System.arraycopy(exon, 0, origxon, 0, exon.length);
 116     int cdspos = 0;
 117     for (int x = 0; x < exon.length; x += 2)
 118     {
 119       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 120       if (expectedCdsLength <= cdspos)
 121       {
 122         // advanced beyond last codon.
 123         sxpos = x;
 124         if (expectedCdsLength != cdspos)
 125         {
 126           // System.err
 127           // .println("Truncating final exon interval on region by "
 128           // + (cdspos - cdslength));
 129         }
 130
 131         /*
 132          * shrink the final exon - reduce end position if forward
 133          * strand, increase it if reverse
 134          */
 135         if (exon[x + 1] >= exon[x])
 136         {
 137           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 138         }
 139         else
 140         {
 141           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 142         }
 143         break;
 144       }
 145     }
 146
 147     if (sxpos != -1)
 148     {
 149       // and trim the exon interval set if necessary
 150       int[] nxon = new int[sxpos + 2];
 151       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 152       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 153                                 // set
 154       exon = nxon;
 155     }
 156     return exon;
 157   }
 158
  /*
   * when true, interpret the mol_type 'source' feature attribute
   * and generate an RNA sequence from the DNA record
   * (T is replaced by U when the sequence is built)
   */
  protected boolean produceRna = true;

  /*
   * the name of the source database, as supplied to the constructor
   */
  protected String sourceDb;

  /*
   * values parsed from the data file
   * NOTE(review): accession, version and description are not assigned in
   * this class; presumably they are set by the format-specific subclass
   */
  protected String accession;

  protected String version;

  protected String description;

  /* used as the initial capacity of the buffer the sequence is read into */
  protected int length = 128;

  /* cross-references that are added to the sequence when it is built */
  protected List<DBRefEntry> dbrefs;

  /* set from the 'source' feature's mol_type qualifier */
  protected boolean sequenceStringIsRNA = false;

  /* the sequence residues, with whitespace and position counts removed */
  protected String sequenceString;

  /* parsed CDS features, keyed by their /protein_id value */
  protected Map<String, CdsData> cds;
 185
 186   /**
 187    * Constructor
 188    *
 189    * @param fp
 190    * @param sourceId
 191    * @throws IOException
 192    */
 193   public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException
 194   {
 195     super(false, fp); // don't parse immediately
 196     this.sourceDb = sourceId;
 197     dbrefs = new ArrayList<>();
 198
 199     /*
 200      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 201      */
 202     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 203
 204     parse();
 205   }
 206
 207   /**
 208    * process attributes for 'source' until the next FT feature entry only
 209    * interested in 'mol_type'
 210    *
 211    * @param tokens
 212    * @return
 213    * @throws IOException
 214    */
 215   private String parseSourceQualifiers(String[] tokens) throws IOException
 216   {
 217     if (!"source".equals(tokens[0]))
 218     {
 219       throw (new RuntimeException("Not given a 'source' qualifier line"));
 220     }
 221     // search for mol_type attribute
 222
 223     StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of
 224                                                               // sequence
 225
 226     String line = parseFeatureQualifier(sb, false);
 227     while (line != null)
 228     {
 229       if (!line.startsWith("FT    ")) // four spaces, end of this feature table
 230                                       // entry
 231       {
 232         return line;
 233       }
 234
 235       // case sensitive ?
 236       int p = line.indexOf("\\mol_type");
 237       int qs = line.indexOf("\"", p);
 238       int qe = line.indexOf("\"", qs + 1);
 239       String qualifier = line.substring(qs, qe).toLowerCase(Locale.ROOT);
 240       if (qualifier.indexOf("rna") > -1)
 241       {
 242         sequenceStringIsRNA = true;
 243       }
 244       if (qualifier.indexOf("dna") > -1)
 245       {
 246         sequenceStringIsRNA = false;
 247       }
 248       line = parseFeatureQualifier(sb, false);
 249     }
 250     return line;
 251   }
 252
 253   /**
 254    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
 255    * returns the next line
 256    *
 257    * @param location
 258    * @return
 259    * @throws IOException
 260    */
 261   protected String parseCDSFeature(String location) throws IOException
 262   {
 263     String line;
 264
 265     /*
 266      * parse location, which can be over >1 line e.g. EAW51554
 267      */
 268     CdsData data = new CdsData();
 269     StringBuilder sb = new StringBuilder().append(location);
 270     line = parseFeatureQualifier(sb, false);
 271     data.cdsLocation = sb.toString();
 272
 273     while (line != null)
 274     {
 275       if (!isFeatureContinuationLine(line))
 276       {
 277         // e.g. start of next feature "FT source..."
 278         break;
 279       }
 280
 281       /*
 282        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 283        * - the value may extend over more than one line
 284        * - if the value has enclosing quotes, these are removed
 285        * - escaped double quotes ("") are reduced to a single character
 286        */
 287       int slashPos = line.indexOf('/');
 288       if (slashPos == -1)
 289       {
 290         Console.error("Unexpected EMBL line ignored: " + line);
 291         line = nextLine();
 292         continue;
 293       }
 294       int eqPos = line.indexOf('=', slashPos + 1);
 295       if (eqPos == -1)
 296       {
 297         // can happen, e.g. /ribosomal_slippage
 298         line = nextLine();
 299         continue;
 300       }
 301       String qualifier = line.substring(slashPos + 1, eqPos);
 302       String value = line.substring(eqPos + 1);
 303       value = removeQuotes(value);
 304       sb = new StringBuilder().append(value);
 305       boolean asText = !"translation".equals(qualifier);
 306       line = parseFeatureQualifier(sb, asText);
 307       String featureValue = sb.toString();
 308
 309       if ("protein_id".equals(qualifier))
 310       {
 311         data.proteinId = featureValue;
 312       }
 313       else if ("codon_start".equals(qualifier))
 314       {
 315         try
 316         {
 317           data.codonStart = Integer.parseInt(featureValue.trim());
 318         } catch (NumberFormatException e)
 319         {
 320           Console.error("Invalid codon_start in XML for " + this.accession
 321                   + ": " + e.getMessage());
 322         }
 323       }
 324       else if ("db_xref".equals(qualifier))
 325       {
 326         String[] parts = featureValue.split(":");
 327         if (parts.length == 2)
 328         {
 329           String db = parts[0].trim();
 330           db = DBRefUtils.getCanonicalName(db);
 331           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 332           data.xrefs.add(dbref);
 333         }
 334       }
 335       else if ("product".equals(qualifier))
 336       {
 337         data.proteinName = featureValue;
 338       }
 339       else if ("translation".equals(qualifier))
 340       {
 341         data.translation = featureValue;
 342       }
 343       else if (!"".equals(featureValue))
 344       {
 345         // throw anything else into the additional properties hash
 346         data.cdsProps.put(qualifier, featureValue);
 347       }
 348     }
 349
 350     if (data.proteinId != null)
 351     {
 352       this.cds.put(data.proteinId, data);
 353     }
 354     else
 355     {
 356       Console.error("Ignoring CDS feature with no protein_id for "
 357               + sourceDb + ":" + accession);
 358     }
 359
 360     return line;
 361   }
 362
  /**
   * Answers true if the line belongs to the feature currently being parsed
   * (it holds one of its qualifiers, or continues a location or qualifier
   * value), or false if it starts another feature or is some other kind of
   * line. The parsing methods of this class stop reading a feature at the
   * first line for which this answers false.
   *
   * @param line
   * @return
   */
  protected abstract boolean isFeatureContinuationLine(String line);
 364
  /**
   * Output (print) is not (yet) implemented for flat file format; both
   * arguments are ignored.
   *
   * @param seqs
   *          ignored
   * @param jvsuffix
   *          ignored
   * @return null, always
   */
  @Override
  public String print(SequenceI[] seqs, boolean jvsuffix)
  {
    return null;
  }
 373
 374   /**
 375    * Constructs and saves the sequence from parsed components
 376    */
 377   protected void buildSequence()
 378   {
 379     if (this.accession == null || this.sequenceString == null)
 380     {
 381       Console.error("Failed to parse data from EMBL");
 382       return;
 383     }
 384
 385     String name = this.accession;
 386     if (this.sourceDb != null)
 387     {
 388       name = this.sourceDb + "|" + name;
 389     }
 390
 391     if (produceRna && sequenceStringIsRNA)
 392     {
 393       sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
 394     }
 395
 396     SequenceI seq = new Sequence(name, this.sequenceString);
 397     seq.setDescription(this.description);
 398
 399     /*
 400      * add a DBRef to itself
 401      */
 402     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 403     int[] startEnd = new int[] { 1, seq.getLength() };
 404     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 405     seq.addDBRef(selfRef);
 406
 407     for (DBRefEntry dbref : this.dbrefs)
 408     {
 409       seq.addDBRef(dbref);
 410     }
 411
 412     processCDSFeatures(seq);
 413
 414     seq.deriveSequence();
 415
 416     addSequence(seq);
 417   }
 418
 419   /**
 420    * Process the CDS features, including generation of cross-references and
 421    * mappings to the protein products (translation)
 422    *
 423    * @param seq
 424    */
 425   protected void processCDSFeatures(SequenceI seq)
 426   {
 427     /*
 428      * record protein products found to avoid duplication i.e. >1 CDS with
 429      * the same /protein_id [though not sure I can find an example of this]
 430      */
 431     Map<String, SequenceI> proteins = new HashMap<>();
 432     for (CdsData data : cds.values())
 433     {
 434       processCDSFeature(seq, data, proteins);
 435     }
 436   }
 437
 438   /**
 439    * Processes data for one parsed CDS feature to
 440    * <ul>
 441    * <li>create a protein product sequence for the translation</li>
 442    * <li>create a cross-reference to protein with mapping from dna</li>
 443    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 444    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 445    * </ul>
 446    *
 447    * @param SequenceI
 448    *          dna
 449    * @param proteins
 450    *          map of protein products so far derived from CDS data
 451    */
 452   void processCDSFeature(SequenceI dna, CdsData data,
 453           Map<String, SequenceI> proteins)
 454   {
 455     /*
 456      * parse location into a list of [start, end, start, end] positions
 457      */
 458     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 459
 460     MapList maplist = buildMappingToProtein(dna, exons, data);
 461
 462     int exonNumber = 0;
 463
 464     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 465     {
 466       int exonStart = exons[xint];
 467       int exonEnd = exons[xint + 1];
 468       int begin = Math.min(exonStart, exonEnd);
 469       int end = Math.max(exonStart, exonEnd);
 470       exonNumber++;
 471       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 472               exonNumber, data.proteinId);
 473
 474       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 475               this.sourceDb);
 476       for (Entry<String, String> val : data.cdsProps.entrySet())
 477       {
 478         sf.setValue(val.getKey(), val.getValue());
 479       }
 480
 481       sf.setEnaLocation(data.cdsLocation);
 482       boolean forwardStrand = exonStart <= exonEnd;
 483       sf.setStrand(forwardStrand ? "+" : "-");
 484       sf.setPhase(String.valueOf(data.codonStart - 1));
 485       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 486       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 487
 488       dna.addSequenceFeature(sf);
 489     }
 490
 491     boolean hasUniprotDbref = false;
 492     for (DBRefEntry xref : data.xrefs)
 493     {
 494       dna.addDBRef(xref);
 495       if (xref.getSource().equals(DBRefSource.UNIPROT))
 496       {
 497         /*
 498          * construct (or find) the sequence for (data.protein_id, data.translation)
 499          */
 500         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 501         Mapping map = new Mapping(protein, maplist);
 502         map.setMappedFromId(data.proteinId);
 503         xref.setMap(map);
 504
 505         /*
 506          * add DBRefs with mappings from dna to protein and the inverse
 507          */
 508         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 509         db1.setMap(new Mapping(dna, maplist.getInverse()));
 510         protein.addDBRef(db1);
 511
 512         hasUniprotDbref = true;
 513       }
 514     }
 515
 516     /*
 517      * if we have a product (translation) but no explicit Uniprot dbref
 518      * (example: EMBL M19487 protein_id AAB02592.1)
 519      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 520      */
 521     if (!hasUniprotDbref)
 522     {
 523       SequenceI protein = proteins.get(data.proteinId);
 524       if (protein == null)
 525       {
 526         protein = new Sequence(data.proteinId, data.translation);
 527         protein.setDescription(data.proteinName);
 528         proteins.put(data.proteinId, protein);
 529       }
 530       // assuming CDSPROTEIN sequence version = dna version (?!)
 531       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 532               this.version, data.proteinId);
 533       protein.addDBRef(db1);
 534
 535       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 536               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 537       Mapping map = new Mapping(protein, maplist);
 538       map.setMappedFromId(data.proteinId);
 539       dnaToEmblProteinRef.setMap(map);
 540       dna.addDBRef(dnaToEmblProteinRef);
 541     }
 542
 543     /*
 544      * comment brought forward from EmblXmlSource, lines 447-451:
 545      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 546      * sequence with the exon  map; if given a dataset reference, search
 547      * dataset for parent EMBL sequence if it exists and set its map;
 548      * make a new feature annotating the coding contig
 549      */
 550   }
 551
 552   /**
 553    * Computes a mapping from CDS positions in DNA sequence to protein product
 554    * positions, with allowance for stop codon or incomplete start codon
 555    *
 556    * @param dna
 557    * @param exons
 558    * @param data
 559    * @return
 560    */
 561   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 562           final CdsData data)
 563   {
 564     MapList dnaToProteinMapping = null;
 565     int peptideLength = data.translation.length();
 566
 567     int[] proteinRange = new int[] { 1, peptideLength };
 568     if (exons != null && exons.length > 0)
 569     {
 570       /*
 571        * We were able to parse 'location'; do a final
 572        * product length truncation check
 573        */
 574       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 575       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 576     }
 577     else
 578     {
 579       /*
 580        * workaround until we handle all 'location' formats fully
 581        * e.g. X53828.1:60..1058 or <123..>289
 582        */
 583       Console.error(String.format(
 584               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 585                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 586               data.cdsLocation, sourceDb, this.accession));
 587
 588       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 589       int mappedDnaEnd = dna.getEnd();
 590       if (peptideLength * 3 == completeCodonsLength)
 591       {
 592         // this might occur for CDS sequences where no features are marked
 593         Console.warn("Assuming no stop codon at end of cDNA fragment");
 594         mappedDnaEnd = dna.getEnd();
 595       }
 596       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 597       {
 598         Console.warn("Assuming stop codon at end of cDNA fragment");
 599         mappedDnaEnd = dna.getEnd() - 3;
 600       }
 601
 602       if (mappedDnaEnd != -1)
 603       {
 604         int[] cdsRanges = new int[] {
 605             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 606         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 607       }
 608     }
 609
 610     return dnaToProteinMapping;
 611   }
 612
 613   /**
 614    * Constructs a sequence for the protein product for the CDS data (if there is
 615    * one), and dbrefs with mappings from CDS to protein and the reverse
 616    *
 617    * @param dna
 618    * @param xref
 619    * @param data
 620    * @param proteins
 621    * @return
 622    */
 623   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 624           CdsData data, Map<String, SequenceI> proteins)
 625   {
 626     /*
 627      * check we have some data to work with
 628      */
 629     if (data.proteinId == null || data.translation == null)
 630     {
 631       return null;
 632     }
 633
 634     /*
 635      * Construct the protein sequence (if not already seen)
 636      */
 637     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 638     SequenceI protein = proteins.get(proteinSeqName);
 639     if (protein == null)
 640     {
 641       protein = new Sequence(proteinSeqName, data.translation, 1,
 642               data.translation.length());
 643       protein.setDescription(data.proteinName != null ? data.proteinName
 644               : "Protein Product from " + sourceDb);
 645       proteins.put(proteinSeqName, protein);
 646     }
 647
 648     return protein;
 649   }
 650
 651   /**
 652    * Returns the CDS location as a single array of [start, end, start, end...]
 653    * positions. If on the reverse strand, these will be in descending order.
 654    *
 655    * @param accession
 656    * @param location
 657    * @return
 658    */
 659   protected int[] getCdsRanges(String accession, String location)
 660   {
 661     if (location == null)
 662     {
 663       return new int[] {};
 664     }
 665
 666     try
 667     {
 668       List<int[]> ranges = DnaUtils.parseLocation(location);
 669       return MappingUtils.rangeListToArray(ranges);
 670     } catch (ParseException e)
 671     {
 672       Console.warn(
 673               String.format("Not parsing inexact CDS location %s in ENA %s",
 674                       location, accession));
 675       return new int[] {};
 676     }
 677   }
 678
 679   /**
 680    * Reads the value of a feature (FT) qualifier from one or more lines of the
 681    * file, and returns the next line after that. Values are appended to the
 682    * string buffer, which should be already primed with the value read from the
 683    * first line for the qualifier (with any leading double quote removed).
 684    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 685    * reduced to one only. For example for
 686    *
 687    * <pre>
 688    * FT      /note="gene_id=hCG28070.3
 689    * FT      ""foobar"" isoform=CRA_b"
 690    * the returned value is
 691    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 692    * </pre>
 693    *
 694    * Note the side-effect of this method, to advance data reading to the next
 695    * line after the feature qualifier (which could be another qualifier, a
 696    * different feature, a non-feature line, or null at end of file).
 697    *
 698    * @param sb
 699    *          a string buffer primed with the first line of the value
 700    * @param asText
 701    * @return
 702    * @throws IOException
 703    */
 704   String parseFeatureQualifier(StringBuilder sb, boolean asText)
 705           throws IOException
 706   {
 707     String line;
 708     while ((line = nextLine()) != null)
 709     {
 710       if (!isFeatureContinuationLine(line))
 711       {
 712         break; // reached next feature or other input line
 713       }
 714       String[] tokens = line.split(WHITESPACE);
 715       if (tokens.length < 2)
 716       {
 717         Console.error("Ignoring bad EMBL line for " + this.accession + ": "
 718                 + line);
 719         break;
 720       }
 721       if (tokens[1].startsWith("/"))
 722       {
 723         break; // next feature qualifier
 724       }
 725
 726       /*
 727        * if text (e.g. /product), add a word separator for a new line,
 728        * else (e.g. /translation) don't
 729        */
 730       if (asText)
 731       {
 732         sb.append(" ");
 733       }
 734
 735       /*
 736        * remove trailing " and unescape doubled ""
 737        */
 738       String data = removeQuotes(tokens[1]);
 739       sb.append(data);
 740     }
 741
 742     return line;
 743   }
 744
 745   /**
 746    * Reads and saves the sequence, read from the lines following the ORIGIN
 747    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
 748    * discarded. Returns the next line following the sequence data (the next line
 749    * that doesn't start with whitespace).
 750    *
 751    * @throws IOException
 752    */
 753   protected String parseSequence() throws IOException
 754   {
 755     StringBuilder sb = new StringBuilder(this.length);
 756     String line = nextLine();
 757     while (line != null && line.startsWith(" "))
 758     {
 759       line = line.trim();
 760       String[] blocks = line.split(WHITESPACE);
 761
 762       /*
 763        * the first or last block on each line might be a position count - omit
 764        */
 765       for (int i = 0; i < blocks.length; i++)
 766       {
 767         try
 768         {
 769           Long.parseLong(blocks[i]);
 770           // position counter - ignore it
 771         } catch (NumberFormatException e)
 772         {
 773           // sequence data - append it
 774           sb.append(blocks[i]);
 775         }
 776       }
 777       line = nextLine();
 778     }
 779     this.sequenceString = sb.toString();
 780
 781     return line;
 782   }
 783
 784   /**
 785    * Processes a feature line. If it declares a feature type of interest
 786    * (currently, only CDS is processed), processes all of the associated lines
 787    * (feature qualifiers), and returns the next line after that, otherwise
 788    * simply returns the next line.
 789    *
 790    * @param line
 791    *          the first line for the feature (with initial FT omitted for EMBL
 792    *          format)
 793    * @return
 794    * @throws IOException
 795    */
 796   protected String parseFeature(String line) throws IOException
 797   {
 798     String[] tokens = line.trim().split(WHITESPACE);
 799     if (tokens.length < 2
 800             || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0]))))
 801     {
 802       return nextLine();
 803     }
 804     if (tokens[0].equals("source"))
 805     {
 806       return parseSourceQualifiers(tokens);
 807     }
 808     return parseCDSFeature(tokens[1]);
 809   }
 810 }
 811
/**
 * A data bean class to hold values parsed from one CDS Feature. All fields
 * are package-visible and are read and written directly by the parser.
 */
class CdsData
{
  /** the protein sequence, from /translation qualifier; null if absent */
  String translation; // from /translation qualifier

  /** the location as read from the file, not yet parsed into ranges */
  String cdsLocation; // the raw value e.g. join(1..1234,2012..2837)

  /** defaults to 1 if there is no (valid) /codon_start qualifier */
  int codonStart = 1; // from /codon_start qualifier

  String proteinName; // from /product qualifier; used for protein description

  /** also the key under which this CDS is stored by the parser */
  String proteinId; // from /protein_id qualifier

  List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers

  /** qualifier name to (non-empty) value, for qualifiers not held above */
  Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers
}