src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Hashtable;
   7 import java.util.List;
   8 import java.util.Map;
   9 import java.util.Map.Entry;
  10
  11 import jalview.bin.Cache;
  12 import jalview.datamodel.DBRefEntry;
  13 import jalview.datamodel.DBRefSource;
  14 import jalview.datamodel.FeatureProperties;
  15 import jalview.datamodel.Sequence;
  16 import jalview.datamodel.SequenceFeature;
  17 import jalview.datamodel.SequenceI;
  18 import jalview.util.DBRefUtils;
  19 import jalview.util.DnaUtils;
  20 import jalview.util.MappingUtils;
  21
  22 /**
  23  * A class that provides selective parsing of the EMBL flatfile format.
  24  * <p>
  25  * The initial implementation is limited to extracting fields used by Jalview
  26  * after fetching an EMBL or EMBLCDS entry:
  27  *
  28  * <pre>
  29  * accession, version, sequence, xref
  30  * and (for CDS feature) location, protein_id, product, codon_start, translation
  31  * </pre>
  32  *
  33  * For a complete parser, it may be best to adopt that provided in
  34  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  35  * (but note this has a dependency on the Apache Commons library)
  36  *
  37  * @author gmcarstairs
  38  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  39  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  40  */
  41 public class EmblFlatFile extends AlignFile // FileParse
  42 {
  43   /**
  44    * A data bean class to hold values parsed from one CDS Feature (FT)
  45    */
  46   class CdsData
  47   {
  48     String translation; // from CDS feature /translation
  49
  50     String cdsLocation; // CDS /location raw value
  51
  52     int codonStart = 1; // from CDS /codon_start
  53
  54     String proteinName; // from CDS /product; TODO: use for protein description
  55
  56     String proteinId; // from CDS /protein_id
  57
  58     Map<String, String> cdsProps  = new Hashtable<>(); // CDS other qualifiers
  59   }
  60
  61   private static final String WHITESPACE = "\\s+";
  62
  63   private String sourceDb;
  64
  65   /*
  66    * values parsed from the EMBL flatfile record
  67    */
  68   private String accession; // from ID (first token)
  69
  70   private String version; // from ID (second token)
  71
  72   private int length = 128; // from ID (7th token), with usable default
  73
  74   private List<DBRefEntry> dbrefs; // from DR and also CDS /db_xref qualifiers
  75
  76   private String sequenceString; // from SQ lines
  77
  78   private List<CdsData> cds;
  79
  80   /**
  81    * Constructor
  82    *
  83    * @param fp
  84    * @param sourceId
  85    * @throws IOException
  86    */
  87   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
  88   {
  89     super(false, fp); // don't parse immediately
  90     this.sourceDb = sourceId;
  91     dbrefs = new ArrayList<>();
  92     cds = new ArrayList<>();
  93   }
  94
  95   /**
  96    * Parses the flatfile, and if successful, saves as an annotated sequence
  97    * which may be retrieved by calling {@code getSequence()}
  98    *
  99    * @throws IOException
 100    */
 101   public void parse() throws IOException
 102   {
 103     String line = nextLine();
 104     while (line != null)
 105     {
 106       if (line.startsWith("ID"))
 107       {
 108         line = parseID(line);
 109       }
 110       else if (line.startsWith("DR"))
 111       {
 112         line = parseDR(line);
 113       }
 114       else if (line.startsWith("SQ"))
 115       {
 116         line = parseSQ();
 117       }
 118       else if (line.startsWith("FT"))
 119       {
 120         line = parseFT(line);
 121       }
 122       else
 123       {
 124         line = nextLine();
 125       }
 126     }
 127     assembleSequence();
 128   }
 129
 130   /**
 131    * Extracts and saves the primary accession and version (SV value) from an ID
 132    * line, or null if not found. Returns the next line after the one processed.
 133    *
 134    * @param line
 135    * @throws IOException
 136    */
 137   String parseID(String line) throws IOException
 138   {
 139     String[] tokens = line.substring(2).split(";");
 140
 141     /*
 142      * first is primary accession
 143      */
 144     String token = tokens[0].trim();
 145     if (!token.isEmpty())
 146     {
 147       this.accession = token;
 148     }
 149
 150     /*
 151      * second token is 'SV versionNo'
 152      */
 153     if (tokens.length > 1)
 154     {
 155       token = tokens[1].trim();
 156       if (token.startsWith("SV"))
 157       {
 158         String[] bits = token.trim().split(WHITESPACE);
 159         this.version = bits[bits.length - 1];
 160       }
 161     }
 162
 163     /*
 164      * seventh token is 'length BP'
 165      */
 166     if (tokens.length > 6)
 167     {
 168       token = tokens[6].trim();
 169       String[] bits = token.trim().split(WHITESPACE);
 170       try
 171       {
 172         this.length = Integer.valueOf(bits[0]);
 173       } catch (NumberFormatException e)
 174       {
 175         Cache.log.error("bad length read in flatfile, line: " + line);
 176       }
 177     }
 178
 179     return nextLine();
 180   }
 181
 182   /**
 183    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 184    * the line following the line processed.
 185    *
 186    * @param line
 187    * @throws IOException
 188    */
 189   String parseDR(String line) throws IOException
 190   {
 191     String[] tokens = line.substring(2).split(";");
 192     if (tokens.length > 1)
 193     {
 194       /*
 195        * ensure UniProtKB/Swiss-Prot converted to UNIPROT
 196        */
 197       String db = tokens[0].trim();
 198       db = DBRefUtils.getCanonicalName(db);
 199       String acc = tokens[1].trim();
 200       if (acc.endsWith("."))
 201       {
 202         acc = acc.substring(0, acc.length() - 1);
 203       }
 204       String version = "0";
 205       if (tokens.length > 2)
 206       {
 207         String secondaryId = tokens[2].trim();
 208         if (!secondaryId.isEmpty())
 209         {
 210           // todo: is this right? secondary id is not a version number
 211       //    version = secondaryId;
 212         }
 213       }
 214       this.dbrefs.add(new DBRefEntry(db, version, acc));
 215     }
 216
 217     return nextLine();
 218   }
 219
 220   /**
 221    * Reads and saves the sequence, read from the lines following the SQ line.
 222    * Whitespace and position counters are discarded. Returns the next line
 223    * following the sequence data (the next line that doesn't start with
 224    * whitespace).
 225    *
 226    * @throws IOException
 227    */
 228   String parseSQ() throws IOException
 229   {
 230     StringBuilder sb = new StringBuilder(this.length);
 231     String line = nextLine();
 232     while (line != null && line.startsWith(" "))
 233     {
 234       line = line.trim();
 235       String[] blocks = line.split(WHITESPACE);
 236
 237       /*
 238        * omit the last block (position counter) on each line
 239        */
 240       for (int i = 0; i < blocks.length - 1; i++)
 241       {
 242         sb.append(blocks[i]);
 243       }
 244       line = nextLine();
 245     }
 246     this.sequenceString = sb.toString();
 247
 248     return line;
 249   }
 250
 251   /**
 252    * Processes an FT line. If it declares a feature type of interest (currently,
 253    * only CDS is processed), processes all of the associated lines (feature
 254    * qualifiers), and returns the next line after that, otherwise simply returns
 255    * the next line.
 256    *
 257    * @param line
 258    * @return
 259    * @throws IOException
 260    */
 261   String parseFT(String line) throws IOException
 262   {
 263     String[] tokens = line.split(WHITESPACE);
 264     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 265     {
 266       return nextLine();
 267     }
 268
 269     CdsData data = new CdsData();
 270     data.cdsLocation = tokens[2];
 271
 272     line = nextLine();
 273     while (line  != null)
 274     {
 275       if (!line.startsWith("FT    ")) // 4 spaces
 276       {
 277         // e.g. start of next feature "FT source..."
 278         break;
 279       }
 280
 281       /*
 282        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 283        */
 284       int slashPos = line.indexOf('/');
 285       if (slashPos == -1)
 286       {
 287         Cache.log.error("Unexpected EMBL line ignored: " + line);
 288         continue;
 289       }
 290       int eqPos = line.indexOf('=', slashPos + 1);
 291       if (eqPos == -1)
 292       {
 293         Cache.log.error("Unexpected EMBL line ignored: " + line);
 294         continue;
 295       }
 296       String qualifier = line.substring(slashPos + 1, eqPos);
 297       String value = line.substring(eqPos + 1);
 298       if (value.startsWith("\"") && value.endsWith("\""))
 299       {
 300         value = value.substring(1, value.length() - 1);
 301       }
 302
 303       if ("protein_id".equals(qualifier))
 304       {
 305         data.proteinId = value;
 306         line = nextLine();
 307       }
 308       else if ("codon_start".equals(qualifier))
 309       {
 310         try
 311         {
 312           data.codonStart = Integer.parseInt(value.trim());
 313         } catch (NumberFormatException e)
 314         {
 315           Cache.log.error("Invalid codon_start in XML for " + this.accession
 316                   + ": " + e.getMessage());
 317         }
 318         line = nextLine();
 319       }
 320       else if ("db_xref".equals(qualifier))
 321       {
 322         String[] parts = value.split(":");
 323         if (parts.length == 2)
 324         {
 325           String db = parts[0].trim();
 326           db = DBRefUtils.getCanonicalName(db);
 327           DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
 328           this.dbrefs.add(dbref);
 329         }
 330         line = nextLine();
 331       }
 332       else if ("product".equals(qualifier))
 333       {
 334         // sometimes name is returned e.g. for V00488
 335         data.proteinName = value;
 336         line = nextLine();
 337       }
 338       else if ("translation".equals(qualifier))
 339       {
 340         line = readTranslation(value, data);
 341       }
 342       else if (!"".equals(value))
 343       {
 344         // throw anything else into the additional properties hash
 345         data.cdsProps.put(qualifier, value);
 346         line = nextLine();
 347       }
 348     }
 349
 350     this.cds.add(data);
 351
 352     return line;
 353   }
 354
 355   /**
 356    * Reads and returns the CDS translation from one or more lines of the file,
 357    * and returns the next line after that
 358    *
 359    * @param value
 360    *          the first line of the translation (likely quoted)
 361    * @param data
 362    * @return
 363    * @throws IOException
 364    */
 365   String readTranslation(String value, CdsData data) throws IOException
 366   {
 367     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
 368     sb.append(value.replace("\"", ""));
 369
 370     String line;
 371     while ((line = nextLine()) != null)
 372     {
 373       if (!line.startsWith("FT    "))
 374       {
 375         break; // reached next feature or other input line
 376       }
 377       String[] tokens = line.split(WHITESPACE);
 378       if (tokens.length < 2)
 379       {
 380         Cache.log.error("Ignoring bad EMBL line: " + line);
 381         break;
 382       }
 383       if (tokens[1].startsWith("/"))
 384       {
 385         break; // next feature qualifier
 386       }
 387       sb.append(tokens[1].replace("\"", ""));
 388     }
 389
 390     data.translation = sb.toString();
 391
 392     return line;
 393   }
 394
 395   /**
 396    * Processes the parsed CDS feature data to
 397    * <ul>
 398    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 399    * <li>create a protein product sequence for the translation</li>
 400    * <li>create a cross-reference to protein with mapping from dna</li>
 401    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 402    * </ul>
 403    *
 404    * @param SequenceI
 405    *          dna
 406    */
 407   void processCDS(SequenceI dna, CdsData data)
 408   {
 409     /*
 410      * parse location into a list of [start, end, start, end] positions
 411      */
 412     int[] exons = getCdsRanges(this.accession, data.cdsLocation);
 413     int exonNumber = 0;
 414
 415     for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
 416     {
 417       int exonStart = exons[xint];
 418       int exonEnd = exons[xint + 1];
 419       int begin = Math.min(exonStart, exonEnd);
 420       int end = Math.max(exonStart, exonEnd);
 421       exonNumber++;
 422       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 423               exonNumber, data.proteinId);
 424
 425       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
 426               this.sourceDb);
 427       for (Entry<String, String> val : data.cdsProps.entrySet())
 428       {
 429         sf.setValue(val.getKey(), val.getValue());
 430       }
 431
 432       sf.setEnaLocation(data.cdsLocation);
 433       boolean forwardStrand = exonStart <= exonEnd;
 434       sf.setStrand(forwardStrand ? "+" : "-");
 435       sf.setPhase(String.valueOf(data.codonStart - 1));
 436       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 437       sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
 438
 439       dna.addSequenceFeature(sf);
 440     }
 441   }
 442
 443   /**
 444    * Constructs a sequence for the protein product (if there is one), and dbrefs
 445    * with mappings from dna to protein and the reverse
 446    */
 447   void processTranslation()
 448   {
 449     // TODO Auto-generated method stub
 450
 451   }
 452
 453   /**
 454    * Constructs and saves the sequence from parsed components
 455    */
 456   void assembleSequence()
 457   {
 458     String name = this.accession;
 459     if (this.sourceDb != null)
 460     {
 461       name = this.sourceDb + "|" + name;
 462     }
 463     SequenceI seq = new Sequence(name, this.sequenceString);
 464     for (DBRefEntry dbref : this.dbrefs)
 465     {
 466       seq.addDBRef(dbref);
 467     }
 468
 469     for (CdsData data : cds)
 470     {
 471       processCDS(seq, data);
 472     };
 473
 474     processTranslation();
 475
 476     seq.deriveSequence();
 477
 478     addSequence(seq);
 479   }
 480
 481   /**
 482    * Output (print) is not implemented for EMBL flat file format
 483    */
 484   @Override
 485   public String print(SequenceI[] seqs, boolean jvsuffix)
 486   {
 487     return null;
 488   }
 489
 490   /**
 491    * Returns the CDS location as a single array of [start, end, start, end...]
 492    * positions. If on the reverse strand, these will be in descending order.
 493    *
 494    * @param accession
 495    * @param location
 496    * @return
 497    */
 498   protected int[] getCdsRanges(String accession, String location)
 499   {
 500     if (location == null)
 501     {
 502       return new int[] {};
 503     }
 504
 505     try
 506     {
 507       List<int[]> ranges = DnaUtils.parseLocation(location);
 508       return MappingUtils.listToArray(ranges);
 509     } catch (ParseException e)
 510     {
 511       Cache.log.warn(
 512               String.format("Not parsing inexact CDS location %s in ENA %s",
 513                       location, accession));
 514       return new int[] {};
 515     }
 516   }
 517 }