src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.HashMap;
   7 import java.util.Hashtable;
   8 import java.util.List;
   9 import java.util.Map;
  10 import java.util.Map.Entry;
  11
  12 import jalview.bin.Cache;
  13 import jalview.datamodel.DBRefEntry;
  14 import jalview.datamodel.FeatureProperties;
  15 import jalview.datamodel.Mapping;
  16 import jalview.datamodel.Sequence;
  17 import jalview.datamodel.SequenceFeature;
  18 import jalview.datamodel.SequenceI;
  19 import jalview.util.DBRefUtils;
  20 import jalview.util.DnaUtils;
  21 import jalview.util.MappingUtils;
  22
  23 /**
  24  * A class that provides selective parsing of the EMBL flatfile format.
  25  * <p>
  26  * The initial implementation is limited to extracting fields used by Jalview
  27  * after fetching an EMBL or EMBLCDS entry:
  28  *
  29  * <pre>
  30  * accession, version, sequence, xref
  31  * and (for CDS feature) location, protein_id, product, codon_start, translation
  32  * </pre>
  33  *
  34  * For a complete parser, it may be best to adopt that provided in
  35  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  36  * (but note this has a dependency on the Apache Commons library)
  37  *
  38  * @author gmcarstairs
  39  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  40  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  41  */
  42 public class EmblFlatFile extends AlignFile // FileParse
  43 {
  44   /**
  45    * A data bean class to hold values parsed from one CDS Feature (FT)
  46    */
  47   class CdsData
  48   {
  49     String translation; // from CDS feature /translation
  50
  51     String cdsLocation; // CDS /location raw value
  52
  53     int codonStart = 1; // from CDS /codon_start
  54
  55     String proteinName; // from CDS /product; used for protein description
  56
  57     String proteinId; // from CDS /protein_id
  58
  59     Map<String, String> cdsProps = new Hashtable<>(); // CDS other qualifiers
  60   }
  61
  62   private static final String WHITESPACE = "\\s+";
  63
  64   private String sourceDb;
  65
  66   /*
  67    * values parsed from the EMBL flatfile record
  68    */
  69   private String accession; // from ID (first token)
  70
  71   private String version; // from ID (second token)
  72
  73   private String description; // from (first) DE line
  74
  75   private int length = 128; // from ID (7th token), with usable default
  76
  77   private List<DBRefEntry> dbrefs; // from DR and also CDS /db_xref qualifiers
  78
  79   private String sequenceString; // from SQ lines
  80
  81   private List<CdsData> cds;
  82
  83   /**
  84    * Constructor
  85    *
  86    * @param fp
  87    * @param sourceId
  88    * @throws IOException
  89    */
  90   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
  91   {
  92     super(false, fp); // don't parse immediately
  93     this.sourceDb = sourceId;
  94     dbrefs = new ArrayList<>();
  95     cds = new ArrayList<>();
  96   }
  97
  98   /**
  99    * Parses the flatfile, and if successful, saves as an annotated sequence
 100    * which may be retrieved by calling {@code getSequence()}
 101    *
 102    * @throws IOException
 103    */
 104   public void parse() throws IOException
 105   {
 106     String line = nextLine();
 107     while (line != null)
 108     {
 109       if (line.startsWith("ID"))
 110       {
 111         line = parseID(line);
 112       }
 113       else if (line.startsWith("DE"))
 114       {
 115         line = parseDE(line);
 116       }
 117       else if (line.startsWith("DR"))
 118       {
 119         line = parseDR(line);
 120       }
 121       else if (line.startsWith("SQ"))
 122       {
 123         line = parseSQ();
 124       }
 125       else if (line.startsWith("FT"))
 126       {
 127         line = parseFT(line);
 128       }
 129       else
 130       {
 131         line = nextLine();
 132       }
 133     }
 134     assembleSequence();
 135   }
 136
 137   /**
 138    * Extracts and saves the primary accession and version (SV value) from an ID
 139    * line, or null if not found. Returns the next line after the one processed.
 140    *
 141    * @param line
 142    * @throws IOException
 143    */
 144   String parseID(String line) throws IOException
 145   {
 146     String[] tokens = line.substring(2).split(";");
 147
 148     /*
 149      * first is primary accession
 150      */
 151     String token = tokens[0].trim();
 152     if (!token.isEmpty())
 153     {
 154       this.accession = token;
 155     }
 156
 157     /*
 158      * second token is 'SV versionNo'
 159      */
 160     if (tokens.length > 1)
 161     {
 162       token = tokens[1].trim();
 163       if (token.startsWith("SV"))
 164       {
 165         String[] bits = token.trim().split(WHITESPACE);
 166         this.version = bits[bits.length - 1];
 167       }
 168     }
 169
 170     /*
 171      * seventh token is 'length BP'
 172      */
 173     if (tokens.length > 6)
 174     {
 175       token = tokens[6].trim();
 176       String[] bits = token.trim().split(WHITESPACE);
 177       try
 178       {
 179         this.length = Integer.valueOf(bits[0]);
 180       } catch (NumberFormatException e)
 181       {
 182         Cache.log.error("bad length read in flatfile, line: " + line);
 183       }
 184     }
 185
 186     return nextLine();
 187   }
 188
 189   /**
 190    * Reads sequence description from the first DE line found. Any trailing
 191    * period is discarded. If there are multiple DE lines, only the first (short
 192    * description) is read, the rest are ignored.
 193    *
 194    * @param line
 195    * @return
 196    * @throws IOException
 197    */
 198   String parseDE(String line) throws IOException
 199   {
 200     String desc = line.substring(2).trim();
 201     if (desc.endsWith("."))
 202     {
 203       desc = desc.substring(0, desc.length() - 1);
 204     }
 205     this.description = desc;
 206
 207     /*
 208      * pass over any additional DE lines
 209      */
 210     while ((line = nextLine()) != null)
 211     {
 212       if (!line.startsWith("DE"))
 213       {
 214         break;
 215       }
 216     }
 217
 218     return line;
 219   }
 220
 221   /**
 222    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 223    * the line following the line processed.
 224    *
 225    * @param line
 226    * @throws IOException
 227    */
 228   String parseDR(String line) throws IOException
 229   {
 230     String[] tokens = line.substring(2).split(";");
 231     if (tokens.length > 1)
 232     {
 233       /*
 234        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 235        */
 236       String db = tokens[0].trim();
 237       db = DBRefUtils.getCanonicalName(db);
 238       String acc = tokens[1].trim();
 239       if (acc.endsWith("."))
 240       {
 241         acc = acc.substring(0, acc.length() - 1);
 242       }
 243       String version = "0";
 244       if (tokens.length > 2)
 245       {
 246         String secondaryId = tokens[2].trim();
 247         if (!secondaryId.isEmpty())
 248         {
 249           // todo: is this right? secondary id is not a version number
 250           // version = secondaryId;
 251         }
 252       }
 253       this.dbrefs.add(new DBRefEntry(db, version, acc));
 254     }
 255
 256     return nextLine();
 257   }
 258
 259   /**
 260    * Reads and saves the sequence, read from the lines following the SQ line.
 261    * Whitespace and position counters are discarded. Returns the next line
 262    * following the sequence data (the next line that doesn't start with
 263    * whitespace).
 264    *
 265    * @throws IOException
 266    */
 267   String parseSQ() throws IOException
 268   {
 269     StringBuilder sb = new StringBuilder(this.length);
 270     String line = nextLine();
 271     while (line != null && line.startsWith(" "))
 272     {
 273       line = line.trim();
 274       String[] blocks = line.split(WHITESPACE);
 275
 276       /*
 277        * omit the last block (position counter) on each line
 278        */
 279       for (int i = 0; i < blocks.length - 1; i++)
 280       {
 281         sb.append(blocks[i]);
 282       }
 283       line = nextLine();
 284     }
 285     this.sequenceString = sb.toString();
 286
 287     return line;
 288   }
 289
 290   /**
 291    * Processes an FT line. If it declares a feature type of interest (currently,
 292    * only CDS is processed), processes all of the associated lines (feature
 293    * qualifiers), and returns the next line after that, otherwise simply returns
 294    * the next line.
 295    *
 296    * @param line
 297    * @return
 298    * @throws IOException
 299    */
 300   String parseFT(String line) throws IOException
 301   {
 302     String[] tokens = line.split(WHITESPACE);
 303     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 304     {
 305       return nextLine();
 306     }
 307
 308     CdsData data = new CdsData();
 309     data.cdsLocation = tokens[2];
 310
 311     line = nextLine();
 312     while (line != null)
 313     {
 314       if (!line.startsWith("FT    ")) // 4 spaces
 315       {
 316         // e.g. start of next feature "FT source..."
 317         break;
 318       }
 319
 320       /*
 321        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 322        */
 323       int slashPos = line.indexOf('/');
 324       if (slashPos == -1)
 325       {
 326         Cache.log.error("Unexpected EMBL line ignored: " + line);
 327         continue;
 328       }
 329       int eqPos = line.indexOf('=', slashPos + 1);
 330       if (eqPos == -1)
 331       {
 332         Cache.log.error("Unexpected EMBL line ignored: " + line);
 333         continue;
 334       }
 335       String qualifier = line.substring(slashPos + 1, eqPos);
 336       String value = line.substring(eqPos + 1);
 337       if (value.startsWith("\"") && value.endsWith("\""))
 338       {
 339         value = value.substring(1, value.length() - 1);
 340       }
 341
 342       if ("protein_id".equals(qualifier))
 343       {
 344         data.proteinId = value;
 345         line = nextLine();
 346       }
 347       else if ("codon_start".equals(qualifier))
 348       {
 349         try
 350         {
 351           data.codonStart = Integer.parseInt(value.trim());
 352         } catch (NumberFormatException e)
 353         {
 354           Cache.log.error("Invalid codon_start in XML for " + this.accession
 355                   + ": " + e.getMessage());
 356         }
 357         line = nextLine();
 358       }
 359       else if ("db_xref".equals(qualifier))
 360       {
 361         String[] parts = value.split(":");
 362         if (parts.length == 2)
 363         {
 364           String db = parts[0].trim();
 365           db = DBRefUtils.getCanonicalName(db);
 366           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 367           this.dbrefs.add(dbref);
 368         }
 369         line = nextLine();
 370       }
 371       else if ("product".equals(qualifier))
 372       {
 373         // sometimes name is returned e.g. for V00488
 374         data.proteinName = value;
 375         line = nextLine();
 376       }
 377       else if ("translation".equals(qualifier))
 378       {
 379         line = readTranslation(value, data);
 380       }
 381       else if (!"".equals(value))
 382       {
 383         // throw anything else into the additional properties hash
 384         data.cdsProps.put(qualifier, value);
 385         line = nextLine();
 386       }
 387     }
 388
 389     this.cds.add(data);
 390
 391     return line;
 392   }
 393
 394   /**
 395    * Reads and returns the CDS translation from one or more lines of the file,
 396    * and returns the next line after that
 397    *
 398    * @param value
 399    *          the first line of the translation (likely quoted)
 400    * @param data
 401    * @return
 402    * @throws IOException
 403    */
 404   String readTranslation(String value, CdsData data) throws IOException
 405   {
 406     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
 407     sb.append(value.replace("\"", ""));
 408
 409     String line;
 410     while ((line = nextLine()) != null)
 411     {
 412       if (!line.startsWith("FT    "))
 413       {
 414         break; // reached next feature or other input line
 415       }
 416       String[] tokens = line.split(WHITESPACE);
 417       if (tokens.length < 2)
 418       {
 419         Cache.log.error("Ignoring bad EMBL line: " + line);
 420         break;
 421       }
 422       if (tokens[1].startsWith("/"))
 423       {
 424         break; // next feature qualifier
 425       }
 426       sb.append(tokens[1].replace("\"", ""));
 427     }
 428
 429     data.translation = sb.toString();
 430
 431     return line;
 432   }
 433
 434   /**
 435    * Constructs and saves the sequence from parsed components
 436    */
 437   void assembleSequence()
 438   {
 439     String name = this.accession;
 440     if (this.sourceDb != null)
 441     {
 442       name = this.sourceDb + "|" + name;
 443     }
 444     SequenceI seq = new Sequence(name, this.sequenceString);
 445     seq.setDescription(this.description);
 446
 447     /*
 448      * add a DBRef to itself
 449      */
 450     DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
 451     int[] startEnd = new int[] { 1, seq.getLength() };
 452     selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
 453     seq.addDBRef(selfRef);
 454
 455     for (DBRefEntry dbref : this.dbrefs)
 456     {
 457       seq.addDBRef(dbref);
 458     }
 459
 460     processAllCDS(seq);
 461
 462     seq.deriveSequence();
 463
 464     addSequence(seq);
 465   }
 466
 467   /**
 468    * Process the CDS features, including generation of cross-references and
 469    * mappings to the protein products (translation)
 470    *
 471    * @param seq
 472    */
 473   protected void processAllCDS(SequenceI seq)
 474   {
 475     /*
 476      * record protein products found to avoid duplication i.e. >1 CDS with
 477      * the same /protein_id [though not sure I can find an example of this]
 478      */
 479     Map<String, SequenceI> proteins = new HashMap<>();
 480     for (CdsData data : cds)
 481     {
 482       processOneCDS(seq, data, proteins);
 483     }
 484   }
 485
 486   /**
 487    * Processes the parsed CDS feature data to
 488    * <ul>
 489    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 490    * <li>create a protein product sequence for the translation</li>
 491    * <li>create a cross-reference to protein with mapping from dna</li>
 492    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 493    * </ul>
 494    *
 495    * @param SequenceI
 496    *          dna
 497    * @param proteins
 498    *          map of protein products so far derived from CDS data
 499    */
 500   void processOneCDS(SequenceI dna, CdsData data,
 501           Map<String, SequenceI> proteins)
 502   {
 503     /*
 504      * parse location into a list of [start, end, start, end] positions
 505      */
 506     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 507     int exonNumber = 0;
 508
 509     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 510     {
 511       int exonStart = exons[xint];
 512       int exonEnd = exons[xint + 1];
 513       int begin = Math.min(exonStart, exonEnd);
 514       int end = Math.max(exonStart, exonEnd);
 515       exonNumber++;
 516       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 517               exonNumber, data.proteinId);
 518
 519       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 520               this.sourceDb);
 521       for (Entry<String, String> val : data.cdsProps.entrySet())
 522       {
 523         sf.setValue(val.getKey(), val.getValue());
 524       }
 525
 526       sf.setEnaLocation(data.cdsLocation);
 527       boolean forwardStrand = exonStart <= exonEnd;
 528       sf.setStrand(forwardStrand ? "+" : "-");
 529       sf.setPhase(String.valueOf(data.codonStart - 1));
 530       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 531       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 532
 533       dna.addSequenceFeature(sf);
 534
 535       linkProteinProduct(dna, data, proteins);
 536     }
 537   }
 538
 539   /**
 540    * Constructs a sequence for the protein product for the CDS data (if there is
 541    * one), and dbrefs with mappings from CDS to protein and the reverse
 542    *
 543    * @param dna
 544    * @param data
 545    * @param proteins
 546    */
 547   void linkProteinProduct(SequenceI dna, CdsData data, Map<String, SequenceI> proteins)
 548   {
 549     /*
 550      * check we have some data to work with
 551      */
 552     if (data.proteinId == null || data.translation == null)
 553     {
 554       return;
 555     }
 556
 557     /*
 558      * Construct the protein sequence (if not already seen)
 559      */
 560     SequenceI protein = proteins.get(data.proteinId);
 561     if (protein == null)
 562     {
 563       protein = new Sequence(data.proteinId, data.translation, 1,
 564               data.translation.length());
 565       protein.setDescription(data.proteinName != null ? data.proteinName
 566               : "Protein Product from " + sourceDb);
 567       proteins.put(data.proteinId, protein);
 568     }
 569   }
 570
 571   /**
 572    * Returns the CDS location as a single array of [start, end, start, end...]
 573    * positions. If on the reverse strand, these will be in descending order.
 574    *
 575    * @param accession
 576    * @param location
 577    * @return
 578    */
 579   protected int[] getCdsRanges(String accession, String location)
 580   {
 581     if (location == null)
 582     {
 583       return new int[] {};
 584     }
 585
 586     try
 587     {
 588       List<int[]> ranges = DnaUtils.parseLocation(location);
 589       return MappingUtils.listToArray(ranges);
 590     } catch (ParseException e)
 591     {
 592       Cache.log.warn(
 593               String.format("Not parsing inexact CDS location %s in ENA %s",
 594                       location, accession));
 595       return new int[] {};
 596     }
 597   }
 598
 599   /**
 600    * Output (print) is not implemented for EMBL flat file format
 601    */
 602   @Override
 603   public String print(SequenceI[] seqs, boolean jvsuffix)
 604   {
 605     return null;
 606   }
 607 }