src/jalview/io/EmblFlatFile.java

   1 package jalview.io;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.util.ArrayList;
   6 import java.util.Hashtable;
   7 import java.util.List;
   8 import java.util.Map;
   9 import java.util.Map.Entry;
  10
  11 import jalview.bin.Cache;
  12 import jalview.datamodel.DBRefEntry;
  13 import jalview.datamodel.DBRefSource;
  14 import jalview.datamodel.FeatureProperties;
  15 import jalview.datamodel.Sequence;
  16 import jalview.datamodel.SequenceFeature;
  17 import jalview.datamodel.SequenceI;
  18 import jalview.util.DnaUtils;
  19 import jalview.util.MappingUtils;
  20
  21 /**
  22  * A class that provides selective parsing of the EMBL flatfile format.
  23  * <p>
  24  * The initial implementation is limited to extracting fields used by Jalview
  25  * after fetching an EMBL or EMBLCDS entry:
  26  *
  27  * <pre>
  28  * accession, version, sequence, xref
  29  * and (for CDS feature) location, protein_id, product, codon_start, translation
  30  * </pre>
  31  *
  32  * For a complete parser, it may be best to adopt that provided in
  33  * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile
  34  * (but note this has a dependency on the Apache Commons library)
  35  *
  36  * @author gmcarstairs
  37  * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
  38  * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html
  39  */
  40 public class EmblFlatFile extends AlignFile // FileParse
  41 {
  42   private static final String WHITESPACE = "\\s+";
  43
  44   private String sourceDb;
  45
  46   /*
  47    * values parsed from the EMBL flatfile record
  48    */
  49   private String accession; // from ID (first token)
  50
  51   private String version; // from ID (second token)
  52
  53   private int length = 128; // from ID (7th token), with usable default
  54
  55   private List<DBRefEntry> dbrefs; // from DR and also CDS /db_xref qualifiers
  56
  57   private String sequenceString; // from SQ lines
  58
  59   private String translation; // from CDS feature /translation
  60
  61   private String cdsLocation; // CDS /location raw value
  62
  63   private int codonStart = 1; // from CDS /codon_start
  64
  65   private String proteinName; // from CDS /product
  66
  67   private String proteinId; // from CDS /protein_id
  68
  69   private Map<String, String> cdsProps; // CDS other qualifiers e.g. 'note'
  70
  71   /**
  72    * Constructor
  73    * @param fp
  74    * @param sourceId
  75    * @throws IOException
  76    */
  77   public EmblFlatFile(FileParse fp, String sourceId) throws IOException
  78   {
  79     super(false, fp); // don't parse immediately
  80     this.sourceDb = sourceId;
  81     dbrefs = new ArrayList<>();
  82     cdsProps = new Hashtable<>();
  83   }
  84
  85   /**
  86    * Parses the flatfile, and if successful, saves as an annotated sequence
  87    * which may be retrieved by calling {@code getSequence()}
  88    *
  89    * @throws IOException
  90    */
  91   public void parse() throws IOException
  92   {
  93     String line = nextLine();
  94     while (line != null)
  95     {
  96       if (line.startsWith("ID"))
  97       {
  98         line = processID(line);
  99       }
 100       else if (line.startsWith("DR"))
 101       {
 102         line = processDR(line);
 103       }
 104       else if (line.startsWith("SQ"))
 105       {
 106         line = processSQ();
 107       }
 108       else if (line.startsWith("FT"))
 109       {
 110         line = processFT(line);
 111       }
 112       else
 113       {
 114         line = nextLine();
 115       }
 116     }
 117     assembleSequence();
 118   }
 119
 120   /**
 121    * Extracts and saves the primary accession and version (SV value) from an ID
 122    * line, or null if not found. Returns the next line after the one processed.
 123    *
 124    * @param line
 125    * @throws IOException
 126    */
 127   String processID(String line) throws IOException
 128   {
 129     String[] tokens = line.substring(2).split(";");
 130
 131     /*
 132      * first is primary accession
 133      */
 134     String token = tokens[0].trim();
 135     if (!token.isEmpty())
 136     {
 137       this.accession = token;
 138     }
 139
 140     /*
 141      * second token is 'SV versionNo'
 142      */
 143     if (tokens.length > 1)
 144     {
 145       token = tokens[1].trim();
 146       if (token.startsWith("SV"))
 147       {
 148         String[] bits = token.trim().split(WHITESPACE);
 149         this.version = bits[bits.length - 1];
 150       }
 151     }
 152
 153     /*
 154      * seventh token is 'length BP'
 155      */
 156     if (tokens.length > 6)
 157     {
 158       token = tokens[6].trim();
 159       String[] bits = token.trim().split(WHITESPACE);
 160       try
 161       {
 162         this.length = Integer.valueOf(bits[0]);
 163       } catch (NumberFormatException e)
 164       {
 165         Cache.log.error("bad length read in flatfile, line: " + line);
 166       }
 167     }
 168
 169     return nextLine();
 170   }
 171
 172   /**
 173    * Processes one DR line and saves as a DBRefEntry cross-reference. Returns
 174    * the line following the line processed.
 175    *
 176    * @param line
 177    * @throws IOException
 178    */
 179   String processDR(String line) throws IOException
 180   {
 181     String[] tokens = line.substring(2).split(";");
 182     if (tokens.length > 1)
 183     {
 184       String db = tokens[0].trim();
 185       String acc = tokens[1].trim();
 186       if (acc.endsWith("."))
 187       {
 188         acc = acc.substring(0, acc.length() - 1);
 189       }
 190       this.dbrefs.add(new DBRefEntry(db, "0", acc));
 191     }
 192
 193     return nextLine();
 194   }
 195
 196   /**
 197    * Reads and saves the sequence, read from the lines following the SQ line.
 198    * Whitespace and position counters are discarded. Returns the next line
 199    * following the sequence data (the next line that doesn't start with
 200    * whitespace).
 201    *
 202    * @throws IOException
 203    */
 204   String processSQ() throws IOException
 205   {
 206     StringBuilder sb = new StringBuilder(this.length);
 207     String line = nextLine();
 208     while (line != null && line.startsWith(" "))
 209     {
 210       line = line.trim();
 211       String[] blocks = line.split(WHITESPACE);
 212
 213       /*
 214        * omit the last block (position counter) on each line
 215        */
 216       for (int i = 0; i < blocks.length - 1; i++)
 217       {
 218         sb.append(blocks[i]);
 219       }
 220       line = nextLine();
 221     }
 222     this.sequenceString = sb.toString();
 223
 224     return line;
 225   }
 226
 227   /**
 228    * Processes an FT line. If it declares a feature type of interest (currently,
 229    * only CDS is processed), processes all of the associated lines (feature
 230    * qualifiers), and returns the next line after that, otherwise simply returns
 231    * the next line.
 232    *
 233    * @param line
 234    * @return
 235    * @throws IOException
 236    */
 237   String processFT(String line) throws IOException
 238   {
 239     String[] tokens = line.split(WHITESPACE);
 240     if (tokens.length < 3 || !"CDS".equals(tokens[1]))
 241     {
 242       return nextLine();
 243     }
 244
 245     this.cdsLocation = tokens[2];
 246
 247     while ((line = nextLine()) != null)
 248     {
 249       if (!line.startsWith("FT    ")) // 4 spaces
 250       {
 251         // e.g. start of next feature "FT source..."
 252         break;
 253       }
 254
 255       /*
 256        * extract qualifier, e.g. FT    /protein_id="CAA37824.1"
 257        */
 258       int slashPos = line.indexOf('/');
 259       if (slashPos == -1)
 260       {
 261         Cache.log.error("Unexpected EMBL line ignored: " + line);
 262         continue;
 263       }
 264       int eqPos = line.indexOf('=', slashPos + 1);
 265       if (eqPos == -1)
 266       {
 267         Cache.log.error("Unexpected EMBL line ignored: " + line);
 268         continue;
 269       }
 270       String qualifier = line.substring(slashPos + 1, eqPos);
 271       String value = line.substring(eqPos + 1);
 272       if (value.startsWith("\"") && value.endsWith("\""))
 273       {
 274         value = value.substring(1, value.length() - 1);
 275       }
 276
 277       if ("protein_id".equals(qualifier))
 278       {
 279         proteinId = value;
 280       }
 281       else if ("codon_start".equals(qualifier))
 282       {
 283         try
 284         {
 285           codonStart = Integer.parseInt(value.trim());
 286         } catch (NumberFormatException e)
 287         {
 288           Cache.log.error("Invalid codon_start in XML for " + this.accession
 289                   + ": " + e.getMessage());
 290         }
 291       }
 292       else if ("product".equals(qualifier))
 293       {
 294         // sometimes name is returned e.g. for V00488
 295         proteinName = value;
 296       }
 297       else if ("translation".equals(qualifier))
 298       {
 299         line = readTranslation(value);
 300       }
 301       else if (!"".equals(value))
 302       {
 303         // throw anything else into the additional properties hash
 304         cdsProps.put(qualifier, value);
 305       }
 306     }
 307
 308     return line;
 309   }
 310
 311   /**
 312    * Reads and saves the CDS translation from one or more lines of the file, and
 313    * returns the next line after that
 314    *
 315    * @param value
 316    *          the first line of the translation (likely quoted)
 317    * @return
 318    * @throws IOException
 319    */
 320   String readTranslation(String value) throws IOException
 321   {
 322     StringBuilder sb = new StringBuilder(this.length / 3 + 1);
 323     sb.append(value.replace("\"", ""));
 324
 325     String line;
 326     while ((line = nextLine()) != null)
 327     {
 328       if (!line.startsWith("FT    "))
 329       {
 330         break; // reached next feature or other input line
 331       }
 332       String[] tokens = line.split(WHITESPACE);
 333       if (tokens.length < 2)
 334       {
 335         Cache.log.error("Ignoring bad EMBL line: " + line);
 336         break;
 337       }
 338       if (tokens[1].startsWith("/"))
 339       {
 340         break; // next feature qualifier
 341       }
 342       sb.append(tokens[1].replace("\"", ""));
 343     }
 344
 345     return sb.toString();
 346   }
 347
 348   /**
 349    * Processes the parsed CDS feature data to
 350    * <ul>
 351    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
 352    * <li>create a protein product sequence for the translation</li>
 353    * <li>create a cross-reference to protein with mapping from dna</li>
 354    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
 355    * </ul>
 356    * @param SequenceI dna
 357    */
 358   void processCDS(SequenceI dna)
 359   {
 360     /*
 361      * parse location into a list of [start, end, start, end] positions
 362      */
 363     int[] exons = getCdsRanges(this.accession, this.cdsLocation);
 364     int exonNumber = 0;
 365
 366     for (int xint = 0; exons != null
 367             && xint < exons.length - 1; xint += 2)
 368     {
 369       int exonStart = exons[xint];
 370       int exonEnd = exons[xint + 1];
 371       int begin = Math.min(exonStart, exonEnd);
 372       int end = Math.max(exonStart, exonEnd);
 373       exonNumber++;
 374       String desc = String.format("Exon %d for protein EMBLCDS:%s",
 375               exonNumber, proteinId);
 376
 377       SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb);
 378       if (!cdsProps.isEmpty())
 379       {
 380         for (Entry<String, String> val : cdsProps.entrySet())
 381         {
 382           sf.setValue(val.getKey(), val.getValue());
 383         }
 384       }
 385
 386       sf.setEnaLocation(this.cdsLocation);
 387       boolean forwardStrand = exonStart <= exonEnd;
 388       sf.setStrand(forwardStrand ? "+" : "-");
 389       sf.setPhase(String.valueOf(codonStart - 1));
 390       sf.setValue(FeatureProperties.EXONPOS, exonNumber);
 391       sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
 392
 393       dna.addSequenceFeature(sf);
 394     }
 395   }
 396
 397   /**
 398    * Constructs and saves the sequence from parsed components
 399    */
 400   void assembleSequence()
 401   {
 402     String name = this.accession;
 403     if (this.sourceDb != null)
 404     {
 405       name = this.sourceDb + "|" + name;
 406     }
 407     SequenceI seq = new Sequence(name, this.sequenceString);
 408     for (DBRefEntry dbref : this.dbrefs)
 409     {
 410       seq.addDBRef(dbref);
 411     }
 412
 413     processCDS(seq);
 414     seq.deriveSequence();
 415
 416     addSequence(seq);
 417   }
 418
 419   /**
 420    * Output (print) is not implemented for EMBL flat file format
 421    */
 422   @Override
 423   public String print(SequenceI[] seqs, boolean jvsuffix)
 424   {
 425     return null;
 426   }
 427
 428   /**
 429    * Returns the CDS location as a single array of [start, end, start, end...]
 430    * positions. If on the reverse strand, these will be in descending order.
 431    *
 432    * @param accession
 433    * @param location
 434    * @return
 435    */
 436   protected int[] getCdsRanges(String accession, String location)
 437   {
 438     if (location == null)
 439     {
 440       return new int[] {};
 441     }
 442
 443     try
 444     {
 445       List<int[]> ranges = DnaUtils.parseLocation(location);
 446       return MappingUtils.listToArray(ranges);
 447     } catch (ParseException e)
 448     {
 449       Cache.log.warn(
 450               String.format("Not parsing inexact CDS location %s in ENA %s",
 451                       location, accession));
 452       return new int[] {};
 453     }
 454   }
 455 }