src/jalview/io/EMBLLikeFlatFile.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io;
  22
  23 import java.io.IOException;
  24 import java.text.ParseException;
  25 import java.util.ArrayList;
  26 import java.util.Arrays;
  27 import java.util.HashMap;
  28 import java.util.Hashtable;
  29 import java.util.List;
  30 import java.util.Locale;
  31 import java.util.Map;
  32 import java.util.Map.Entry;
  33 import java.util.TreeMap;
  34
  35 import jalview.bin.Console;
  36 import jalview.datamodel.DBRefEntry;
  37 import jalview.datamodel.DBRefSource;
  38 import jalview.datamodel.FeatureProperties;
  39 import jalview.datamodel.Mapping;
  40 import jalview.datamodel.Sequence;
  41 import jalview.datamodel.SequenceFeature;
  42 import jalview.datamodel.SequenceI;
  43 import jalview.util.DBRefUtils;
  44 import jalview.util.DnaUtils;
  45 import jalview.util.MapList;
  46 import jalview.util.MappingUtils;
  47
  48 /**
  49  * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
  50  * data. Example files (rather than formal specifications) are provided at
  51  *
  52  * <pre>
  53  * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
  54  * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
  55  * </pre>
  56  *
  57  * or to compare the same entry, see
  58  *
  59  * <pre>
  60  * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
  61  * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
  62  * </pre>
  63  *
  64  * The feature table part of the file has a common definition, only the start of
  65  * each line is formatted differently in GenBank and EMBL. See
  66  * http://www.insdc.org/files/feature_table.html#7.1.
  67  */
  68 public abstract class EMBLLikeFlatFile extends AlignFile
  69 {
  70   protected static final String LOCATION = "location";
  71
  72   protected static final String QUOTE = "\"";
  73
  74   protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  75
  76   protected static final String WHITESPACE = "\\s+";
  77
  78   /**
  79    * Removes leading or trailing double quotes (") unless doubled, and changes
  80    * any 'escaped' (doubled) double quotes to single characters. As per the
  81    * Feature Table specification for Qualifiers, Free Text.
  82    *
  83    * @param value
  84    * @return
  85    */
  86   protected static String removeQuotes(String value)
  87   {
  88     if (value == null)
  89     {
  90       return null;
  91     }
  92     if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
  93     {
  94       value = value.substring(1);
  95     }
  96     if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
  97     {
  98       value = value.substring(0, value.length() - 1);
  99     }
 100     value = value.replace(DOUBLED_QUOTE, QUOTE);
 101     return value;
 102   }
 103
 104   /**
 105    * Truncates (if necessary) the exon intervals to match 3 times the length of
 106    * the protein(including truncation for stop codon included in exon)
 107    *
 108    * @param proteinLength
 109    * @param exon
 110    *          an array of [start, end, start, end...] intervals
 111    * @return the same array (if unchanged) or a truncated copy
 112    */
 113   protected static int[] adjustForProteinLength(int proteinLength,
 114           int[] exon)
 115   {
 116     if (proteinLength <= 0 || exon == null)
 117     {
 118       return exon;
 119     }
 120     int expectedCdsLength = proteinLength * 3;
 121     int exonLength = MappingUtils.getLength(Arrays.asList(exon));
 122
 123     /*
 124      * if exon length matches protein, or is shorter, then leave it unchanged
 125      */
 126     if (expectedCdsLength >= exonLength)
 127     {
 128       return exon;
 129     }
 130
 131     int origxon[];
 132     int sxpos = -1;
 133     int endxon = 0;
 134     origxon = new int[exon.length];
 135     System.arraycopy(exon, 0, origxon, 0, exon.length);
 136     int cdspos = 0;
 137     for (int x = 0; x < exon.length; x += 2)
 138     {
 139       cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
 140       if (expectedCdsLength <= cdspos)
 141       {
 142         // advanced beyond last codon.
 143         sxpos = x;
 144         if (expectedCdsLength != cdspos)
 145         {
 146           // System.err
 147           // .println("Truncating final exon interval on region by "
 148           // + (cdspos - cdslength));
 149         }
 150
 151         /*
 152          * shrink the final exon - reduce end position if forward
 153          * strand, increase it if reverse
 154          */
 155         if (exon[x + 1] >= exon[x])
 156         {
 157           endxon = exon[x + 1] - cdspos + expectedCdsLength;
 158         }
 159         else
 160         {
 161           endxon = exon[x + 1] + cdspos - expectedCdsLength;
 162         }
 163         break;
 164       }
 165     }
 166
 167     if (sxpos != -1)
 168     {
 169       // and trim the exon interval set if necessary
 170       int[] nxon = new int[sxpos + 2];
 171       System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
 172       nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
 173                                 // set
 174       exon = nxon;
 175     }
 176     return exon;
 177   }
 178
 179   /*
 180    * when true, interpret the mol_type 'source' feature attribute
 181    * and generate an RNA sequence from the DNA record
 182    */
 183   protected boolean produceRna = true;
 184
 185   /*
 186    * values parsed from the data file
 187    */
 188   protected String sourceDb;
 189
 190   protected String accession;
 191
 192   protected String version;
 193
 194   protected String description;
 195
 196   protected int length = 128;
 197
 198   protected List<DBRefEntry> dbrefs;
 199
 200   protected boolean sequenceStringIsRNA = false;
 201
 202   protected String sequenceString;
 203
 204   protected Map<String, CdsData> cds;
 205
 206   /**
 207    * Constructor
 208    *
 209    * @param fp
 210    * @param sourceId
 211    * @throws IOException
 212    */
 213   public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException
 214   {
 215     super(false, fp); // don't parse immediately
 216     this.sourceDb = sourceId;
 217     dbrefs = new ArrayList<>();
 218
 219     /*
 220      * using TreeMap gives CDS sequences in alphabetical, so readable, order
 221      */
 222     cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 223
 224     parse();
 225   }
 226
 227   /**
 228    * process attributes for 'source' until the next FT feature entry only
 229    * interested in 'mol_type'
 230    *
 231    * @param tokens
 232    * @return
 233    * @throws IOException
 234    */
 235   private String parseSourceQualifiers(String[] tokens) throws IOException
 236   {
 237     if (!"source".equals(tokens[0]))
 238     {
 239       throw (new RuntimeException("Not given a 'source' qualifier line"));
 240     }
 241     // search for mol_type attribute
 242
 243     StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of
 244                                                               // sequence
 245
 246     String line = parseFeatureQualifier(sb, false);
 247     while (line != null)
 248     {
 249       if (!line.startsWith("FT    ")) // four spaces, end of this feature table
 250                                       // entry
 251       {
 252         return line;
 253       }
 254
 255       // case sensitive ?
 256       int p = line.indexOf("\\mol_type");
 257       int qs = line.indexOf("\"", p);
 258       int qe = line.indexOf("\"", qs + 1);
 259       String qualifier = line.substring(qs, qe).toLowerCase(Locale.ROOT);
 260       if (qualifier.indexOf("rna") > -1)
 261       {
 262         sequenceStringIsRNA = true;
 263       }
 264       if (qualifier.indexOf("dna") > -1)
 265       {
 266         sequenceStringIsRNA = false;
 267       }
 268       line = parseFeatureQualifier(sb, false);
 269     }
 270     return line;
 271   }
 272
 273   /**
 274    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
 275    * returns the next line
 276    *
 277    * @param location
 278    * @return
 279    * @throws IOException
 280    */
 281   protected String parseCDSFeature(String location) throws IOException
 282   {
 283     String line;
 284
 285     /*
 286      * parse location, which can be over >1 line e.g. EAW51554
 287      */
 288     CdsData data = new CdsData();
 289     StringBuilder sb = new StringBuilder().append(location);
 290     line = parseFeatureQualifier(sb, false);
 291     data.cdsLocation = sb.toString();
 292
 293     while (line != null)
 294     {
 295       if (!isFeatureContinuationLine(line))
 296       {
 297         // e.g. start of next feature "FT source..."
 298         break;
 299       }
 300
 301       /*
 302        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 303        * - the value may extend over more than one line
 304        * - if the value has enclosing quotes, these are removed
 305        * - escaped double quotes ("") are reduced to a single character
 306        */
 307       int slashPos = line.indexOf('/');
 308       if (slashPos == -1)
 309       {
 310         Console.error("Unexpected EMBL line ignored: " + line);
 311         line = nextLine();
 312         continue;
 313       }
 314       int eqPos = line.indexOf('=', slashPos + 1);
 315       if (eqPos == -1)
 316       {
 317         // can happen, e.g. /ribosomal_slippage
 318         line = nextLine();
 319         continue;
 320       }
 321       String qualifier = line.substring(slashPos + 1, eqPos);
 322       String value = line.substring(eqPos + 1);
 323       value = removeQuotes(value);
 324       sb = new StringBuilder().append(value);
 325       boolean asText = !"translation".equals(qualifier);
 326       line = parseFeatureQualifier(sb, asText);
 327       String featureValue = sb.toString();
 328
 329       if ("protein_id".equals(qualifier))
 330       {
 331         data.proteinId = featureValue;
 332       }
 333       else if ("codon_start".equals(qualifier))
 334       {
 335         try
 336         {
 337           data.codonStart = Integer.parseInt(featureValue.trim());
 338         } catch (NumberFormatException e)
 339         {
 340           Console.error("Invalid codon_start in XML for " + this.accession
 341                   + ": " + e.getMessage());
 342         }
 343       }
 344       else if ("db_xref".equals(qualifier))
 345       {
 346         String[] parts = featureValue.split(":");
 347         if (parts.length == 2)
 348         {
 349           String db = parts[0].trim();
 350           db = DBRefUtils.getCanonicalName(db);
 351           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 352           data.xrefs.add(dbref);
 353         }
 354       }
 355       else if ("product".equals(qualifier))
 356       {
 357         data.proteinName = featureValue;
 358       }
 359       else if ("translation".equals(qualifier))
 360       {
 361         data.translation = featureValue;
 362       }
 363       else if (!"".equals(featureValue))
 364       {
 365         // throw anything else into the additional properties hash
 366         data.cdsProps.put(qualifier, featureValue);
 367       }
 368     }
 369
 370     if (data.proteinId != null)
 371     {
 372       this.cds.put(data.proteinId, data);
 373     }
 374     else
 375     {
 376       Console.error("Ignoring CDS feature with no protein_id for "
 377               + sourceDb + ":" + accession);
 378     }
 379
 380     return line;
 381   }
 382
 383   protected abstract boolean isFeatureContinuationLine(String line);
 384
 385   /**
 386    * Output (print) is not (yet) implemented for flat file format
 387    */
 388   @Override
 389   public String print(SequenceI[] seqs, boolean jvsuffix)
 390   {
 391     return null;
 392   }
 393
 394   /**
 395    * Constructs and saves the sequence from parsed components
 396    */
 397   protected void buildSequence()
 398   {
 399     if (this.accession == null || this.sequenceString == null)
 400     {
 401       Console.error("Failed to parse data from EMBL");
 402       return;
 403     }
 404
 405     String name = this.accession;
 406     if (this.sourceDb != null)
 407     {
 408       name = this.sourceDb + "|" + name;
 409     }
 410
 411     if (produceRna && sequenceStringIsRNA)
 412     {
 413       sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
 414     }
 415
 416     SequenceI seq = new Sequence(name, this.sequenceString);
 417     seq.setDescription(this.description);
 418
 419     /*
 420      * add a DBRef to itself
 421      */
 422     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 423     int[] startEnd = new int[] { 1, seq.getLength() };
 424     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 425     seq.addDBRef(selfRef);
 426
 427     for (DBRefEntry dbref : this.dbrefs)
 428     {
 429       seq.addDBRef(dbref);
 430     }
 431
 432     processCDSFeatures(seq);
 433
 434     seq.deriveSequence();
 435
 436     addSequence(seq);
 437   }
 438
 439   /**
 440    * Process the CDS features, including generation of cross-references and
 441    * mappings to the protein products (translation)
 442    *
 443    * @param seq
 444    */
 445   protected void processCDSFeatures(SequenceI seq)
 446   {
 447     /*
 448      * record protein products found to avoid duplication i.e. >1 CDS with
 449      * the same /protein_id [though not sure I can find an example of this]
 450      */
 451     Map<String, SequenceI> proteins = new HashMap<>();
 452     for (CdsData data : cds.values())
 453     {
 454       processCDSFeature(seq, data, proteins);
 455     }
 456   }
 457
 458   /**
 459    * Processes data for one parsed CDS feature to
 460    * <ul>
 461    * <li>create a protein product sequence for the translation</li>
 462    * <li>create a cross-reference to protein with mapping from dna</li>
 463    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 464    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 465    * </ul>
 466    *
 467    * @param SequenceI
 468    *          dna
 469    * @param proteins
 470    *          map of protein products so far derived from CDS data
 471    */
 472   void processCDSFeature(SequenceI dna, CdsData data,
 473           Map<String, SequenceI> proteins)
 474   {
 475     /*
 476      * parse location into a list of [start, end, start, end] positions
 477      */
 478     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 479
 480     MapList maplist = buildMappingToProtein(dna, exons, data);
 481
 482     int exonNumber = 0;
 483
 484     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 485     {
 486       int exonStart = exons[xint];
 487       int exonEnd = exons[xint + 1];
 488       int begin = Math.min(exonStart, exonEnd);
 489       int end = Math.max(exonStart, exonEnd);
 490       exonNumber++;
 491       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 492               exonNumber, data.proteinId);
 493
 494       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 495               this.sourceDb);
 496       for (Entry<String, String> val : data.cdsProps.entrySet())
 497       {
 498         sf.setValue(val.getKey(), val.getValue());
 499       }
 500
 501       sf.setEnaLocation(data.cdsLocation);
 502       boolean forwardStrand = exonStart <= exonEnd;
 503       sf.setStrand(forwardStrand ? "+" : "-");
 504       sf.setPhase(String.valueOf(data.codonStart - 1));
 505       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 506       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 507
 508       dna.addSequenceFeature(sf);
 509     }
 510
 511     boolean hasUniprotDbref = false;
 512     for (DBRefEntry xref : data.xrefs)
 513     {
 514       dna.addDBRef(xref);
 515       if (xref.getSource().equals(DBRefSource.UNIPROT))
 516       {
 517         /*
 518          * construct (or find) the sequence for (data.protein_id, data.translation)
 519          */
 520         SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
 521         Mapping map = new Mapping(protein, maplist);
 522         map.setMappedFromId(data.proteinId);
 523         xref.setMap(map);
 524
 525         /*
 526          * add DBRefs with mappings from dna to protein and the inverse
 527          */
 528         DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
 529         db1.setMap(new Mapping(dna, maplist.getInverse()));
 530         protein.addDBRef(db1);
 531
 532         hasUniprotDbref = true;
 533       }
 534     }
 535
 536     /*
 537      * if we have a product (translation) but no explicit Uniprot dbref
 538      * (example: EMBL M19487 protein_id AAB02592.1)
 539      * then construct mappings to an assumed EMBLCDSPROTEIN accession
 540      */
 541     if (!hasUniprotDbref)
 542     {
 543       SequenceI protein = proteins.get(data.proteinId);
 544       if (protein == null)
 545       {
 546         protein = new Sequence(data.proteinId, data.translation);
 547         protein.setDescription(data.proteinName);
 548         proteins.put(data.proteinId, protein);
 549       }
 550       // assuming CDSPROTEIN sequence version = dna version (?!)
 551       DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
 552               this.version, data.proteinId);
 553       protein.addDBRef(db1);
 554
 555       DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
 556               DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
 557       Mapping map = new Mapping(protein, maplist);
 558       map.setMappedFromId(data.proteinId);
 559       dnaToEmblProteinRef.setMap(map);
 560       dna.addDBRef(dnaToEmblProteinRef);
 561     }
 562
 563     /*
 564      * comment brought forward from EmblXmlSource, lines 447-451:
 565      * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
 566      * sequence with the exon  map; if given a dataset reference, search
 567      * dataset for parent EMBL sequence if it exists and set its map;
 568      * make a new feature annotating the coding contig
 569      */
 570   }
 571
 572   /**
 573    * Computes a mapping from CDS positions in DNA sequence to protein product
 574    * positions, with allowance for stop codon or incomplete start codon
 575    *
 576    * @param dna
 577    * @param exons
 578    * @param data
 579    * @return
 580    */
 581   MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
 582           final CdsData data)
 583   {
 584     MapList dnaToProteinMapping = null;
 585     int peptideLength = data.translation.length();
 586
 587     int[] proteinRange = new int[] { 1, peptideLength };
 588     if (exons != null && exons.length > 0)
 589     {
 590       /*
 591        * We were able to parse 'location'; do a final
 592        * product length truncation check
 593        */
 594       int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
 595       dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 596     }
 597     else
 598     {
 599       /*
 600        * workaround until we handle all 'location' formats fully
 601        * e.g. X53828.1:60..1058 or <123..>289
 602        */
 603       Console.error(String.format(
 604               "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
 605                       + " - Making up the CDNA region of (%s:%s)... may be incorrect",
 606               data.cdsLocation, sourceDb, this.accession));
 607
 608       int completeCodonsLength = 1 - data.codonStart + dna.getLength();
 609       int mappedDnaEnd = dna.getEnd();
 610       if (peptideLength * 3 == completeCodonsLength)
 611       {
 612         // this might occur for CDS sequences where no features are marked
 613         Console.warn("Assuming no stop codon at end of cDNA fragment");
 614         mappedDnaEnd = dna.getEnd();
 615       }
 616       else if ((peptideLength + 1) * 3 == completeCodonsLength)
 617       {
 618         Console.warn("Assuming stop codon at end of cDNA fragment");
 619         mappedDnaEnd = dna.getEnd() - 3;
 620       }
 621
 622       if (mappedDnaEnd != -1)
 623       {
 624         int[] cdsRanges = new int[] {
 625             dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
 626         dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
 627       }
 628     }
 629
 630     return dnaToProteinMapping;
 631   }
 632
 633   /**
 634    * Constructs a sequence for the protein product for the CDS data (if there is
 635    * one), and dbrefs with mappings from CDS to protein and the reverse
 636    *
 637    * @param dna
 638    * @param xref
 639    * @param data
 640    * @param proteins
 641    * @return
 642    */
 643   SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
 644           CdsData data, Map<String, SequenceI> proteins)
 645   {
 646     /*
 647      * check we have some data to work with
 648      */
 649     if (data.proteinId == null || data.translation == null)
 650     {
 651       return null;
 652     }
 653
 654     /*
 655      * Construct the protein sequence (if not already seen)
 656      */
 657     String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
 658     SequenceI protein = proteins.get(proteinSeqName);
 659     if (protein == null)
 660     {
 661       protein = new Sequence(proteinSeqName, data.translation, 1,
 662               data.translation.length());
 663       protein.setDescription(data.proteinName != null ? data.proteinName
 664               : "Protein Product from " + sourceDb);
 665       proteins.put(proteinSeqName, protein);
 666     }
 667
 668     return protein;
 669   }
 670
 671   /**
 672    * Returns the CDS location as a single array of [start, end, start, end...]
 673    * positions. If on the reverse strand, these will be in descending order.
 674    *
 675    * @param accession
 676    * @param location
 677    * @return
 678    */
 679   protected int[] getCdsRanges(String accession, String location)
 680   {
 681     if (location == null)
 682     {
 683       return new int[] {};
 684     }
 685
 686     try
 687     {
 688       List<int[]> ranges = DnaUtils.parseLocation(location);
 689       return MappingUtils.rangeListToArray(ranges);
 690     } catch (ParseException e)
 691     {
 692       Console.warn(
 693               String.format("Not parsing inexact CDS location %s in ENA %s",
 694                       location, accession));
 695       return new int[] {};
 696     }
 697   }
 698
 699   /**
 700    * Reads the value of a feature (FT) qualifier from one or more lines of the
 701    * file, and returns the next line after that. Values are appended to the
 702    * string buffer, which should be already primed with the value read from the
 703    * first line for the qualifier (with any leading double quote removed).
 704    * Enclosing double quotes are removed, and escaped (repeated) double quotes
 705    * reduced to one only. For example for
 706    *
 707    * <pre>
 708    * FT      /note="gene_id=hCG28070.3
 709    * FT      ""foobar"" isoform=CRA_b"
 710    * the returned value is
 711    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
 712    * </pre>
 713    *
 714    * Note the side-effect of this method, to advance data reading to the next
 715    * line after the feature qualifier (which could be another qualifier, a
 716    * different feature, a non-feature line, or null at end of file).
 717    *
 718    * @param sb
 719    *          a string buffer primed with the first line of the value
 720    * @param asText
 721    * @return
 722    * @throws IOException
 723    */
 724   String parseFeatureQualifier(StringBuilder sb, boolean asText)
 725           throws IOException
 726   {
 727     String line;
 728     while ((line = nextLine()) != null)
 729     {
 730       if (!isFeatureContinuationLine(line))
 731       {
 732         break; // reached next feature or other input line
 733       }
 734       String[] tokens = line.split(WHITESPACE);
 735       if (tokens.length < 2)
 736       {
 737         Console.error("Ignoring bad EMBL line for " + this.accession + ": "
 738                 + line);
 739         break;
 740       }
 741       if (tokens[1].startsWith("/"))
 742       {
 743         break; // next feature qualifier
 744       }
 745
 746       /*
 747        * if text (e.g. /product), add a word separator for a new line,
 748        * else (e.g. /translation) don't
 749        */
 750       if (asText)
 751       {
 752         sb.append(" ");
 753       }
 754
 755       /*
 756        * remove trailing " and unescape doubled ""
 757        */
 758       String data = removeQuotes(tokens[1]);
 759       sb.append(data);
 760     }
 761
 762     return line;
 763   }
 764
 765   /**
 766    * Reads and saves the sequence, read from the lines following the ORIGIN
 767    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
 768    * discarded. Returns the next line following the sequence data (the next line
 769    * that doesn't start with whitespace).
 770    *
 771    * @throws IOException
 772    */
 773   protected String parseSequence() throws IOException
 774   {
 775     StringBuilder sb = new StringBuilder(this.length);
 776     String line = nextLine();
 777     while (line != null && line.startsWith(" "))
 778     {
 779       line = line.trim();
 780       String[] blocks = line.split(WHITESPACE);
 781
 782       /*
 783        * the first or last block on each line might be a position count - omit
 784        */
 785       for (int i = 0; i < blocks.length; i++)
 786       {
 787         try
 788         {
 789           Long.parseLong(blocks[i]);
 790           // position counter - ignore it
 791         } catch (NumberFormatException e)
 792         {
 793           // sequence data - append it
 794           sb.append(blocks[i]);
 795         }
 796       }
 797       line = nextLine();
 798     }
 799     this.sequenceString = sb.toString();
 800
 801     return line;
 802   }
 803
 804   /**
 805    * Processes a feature line. If it declares a feature type of interest
 806    * (currently, only CDS is processed), processes all of the associated lines
 807    * (feature qualifiers), and returns the next line after that, otherwise
 808    * simply returns the next line.
 809    *
 810    * @param line
 811    *          the first line for the feature (with initial FT omitted for EMBL
 812    *          format)
 813    * @return
 814    * @throws IOException
 815    */
 816   protected String parseFeature(String line) throws IOException
 817   {
 818     String[] tokens = line.trim().split(WHITESPACE);
 819     if (tokens.length < 2
 820             || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0]))))
 821     {
 822       return nextLine();
 823     }
 824     if (tokens[0].equals("source"))
 825     {
 826       return parseSourceQualifiers(tokens);
 827     }
 828     return parseCDSFeature(tokens[1]);
 829   }
 830 }
 831
 832 /**
 833  * A data bean class to hold values parsed from one CDS Feature
 834  */
 835 class CdsData
 836 {
 837   String translation; // from /translation qualifier
 838
 839   String cdsLocation; // the raw value e.g. join(1..1234,2012..2837)
 840
 841   int codonStart = 1; // from /codon_start qualifier
 842
 843   String proteinName; // from /product qualifier; used for protein description
 844
 845   String proteinId; // from /protein_id qualifier
 846
 847   List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers
 848
 849   Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers
 850 }