X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FEmblFlatFile.java;h=33701f6934c991f3918174c268523b544d9073bf;hb=0b7d63e48815f1f80b24049ce272d2e0241e07f2;hp=759fa28302f46557330586541aee5cd557c581b4;hpb=fe3cd724aecdeb06a130a502ce3a967ad643f458;p=jalview.git diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 759fa28..33701f6 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -1,22 +1,30 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.io; import java.io.IOException; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import jalview.bin.Cache; +import jalview.bin.Console; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; -import jalview.datamodel.FeatureProperties; -import jalview.datamodel.Sequence; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.util.DnaUtils; -import jalview.util.MappingUtils; +import jalview.util.DBRefUtils; /** * A class that provides selective parsing of the EMBL flatfile format. @@ -37,49 +45,18 @@ import jalview.util.MappingUtils; * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html */ -public class EmblFlatFile extends AlignFile // FileParse +public class EmblFlatFile extends EMBLLikeFlatFile { - private static final String WHITESPACE = "\\s+"; - - private String sourceDb; - - /* - * values parsed from the EMBL flatfile record - */ - private String accession; // from ID (first token) - - private String version; // from ID (second token) - - private int length = 128; // from ID (7th token), with usable default - - private List dbrefs; // from DR and also CDS /db_xref qualifiers - - private String sequenceString; // from SQ lines - - private String translation; // from CDS feature /translation - - private String cdsLocation; // CDS /location raw value - - private int codonStart = 1; // from CDS /codon_start - - private String proteinName; // from CDS /product - - private String proteinId; // from CDS /protein_id - - private Map cdsProps; // CDS other qualifiers e.g. 'note' - /** - * Constructor + * Constructor given a data source and the id of the source database + * * @param fp * @param sourceId * @throws IOException */ public EmblFlatFile(FileParse fp, String sourceId) throws IOException { - super(false, fp); // don't parse immediately - this.sourceDb = sourceId; - dbrefs = new ArrayList<>(); - cdsProps = new Hashtable<>(); + super(fp, sourceId); } /** @@ -88,6 +65,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @throws IOException */ + @Override public void parse() throws IOException { String line = nextLine(); @@ -95,26 +73,30 @@ public class EmblFlatFile extends AlignFile // FileParse { if (line.startsWith("ID")) { - line = processID(line); + line = parseID(line); + } + else if (line.startsWith("DE")) + { + line = parseDE(line); } else if (line.startsWith("DR")) { - line = processDR(line); + line = parseDR(line); } else if (line.startsWith("SQ")) { - line = processSQ(); + line = parseSequence(); } else if (line.startsWith("FT")) { - line = processFT(line); + line = parseFeature(line.substring(2)); } else { line = nextLine(); } } - assembleSequence(); + buildSequence(); } /** @@ -124,7 +106,7 @@ public class EmblFlatFile extends AlignFile // FileParse * @param line * @throws IOException */ - String processID(String line) throws IOException + String parseID(String line) throws IOException { String[] tokens = line.substring(2).split(";"); @@ -162,7 +144,7 @@ public class EmblFlatFile extends AlignFile // FileParse this.length = Integer.valueOf(bits[0]); } catch (NumberFormatException e) { - Cache.log.error("bad length read in flatfile, line: " + line); + Console.error("bad length read in flatfile, line: " + line); } } @@ -170,286 +152,78 @@ public class EmblFlatFile extends AlignFile // FileParse } /** - * Processes one DR line and saves as a DBRefEntry cross-reference. Returns - * the line following the line processed. + * Reads sequence description from the first DE line found. Any trailing + * period is discarded. If there are multiple DE lines, only the first (short + * description) is read, the rest are ignored. * * @param line + * @return * @throws IOException */ - String processDR(String line) throws IOException + String parseDE(String line) throws IOException { - String[] tokens = line.substring(2).split(";"); - if (tokens.length > 1) + String desc = line.substring(2).trim(); + if (desc.endsWith(".")) { - String db = tokens[0].trim(); - String acc = tokens[1].trim(); - if (acc.endsWith(".")) - { - acc = acc.substring(0, acc.length() - 1); - } - this.dbrefs.add(new DBRefEntry(db, "0", acc)); + desc = desc.substring(0, desc.length() - 1); } + this.description = desc; - return nextLine(); - } - - /** - * Reads and saves the sequence, read from the lines following the SQ line. - * Whitespace and position counters are discarded. Returns the next line - * following the sequence data (the next line that doesn't start with - * whitespace). - * - * @throws IOException - */ - String processSQ() throws IOException - { - StringBuilder sb = new StringBuilder(this.length); - String line = nextLine(); - while (line != null && line.startsWith(" ")) + /* + * pass over any additional DE lines + */ + while ((line = nextLine()) != null) { - line = line.trim(); - String[] blocks = line.split(WHITESPACE); - - /* - * omit the last block (position counter) on each line - */ - for (int i = 0; i < blocks.length - 1; i++) + if (!line.startsWith("DE")) { - sb.append(blocks[i]); + break; } - line = nextLine(); } - this.sequenceString = sb.toString(); return line; } /** - * Processes an FT line. If it declares a feature type of interest (currently, - * only CDS is processed), processes all of the associated lines (feature - * qualifiers), and returns the next line after that, otherwise simply returns - * the next line. + * Processes one DR line and saves as a DBRefEntry cross-reference. Returns + * the line following the line processed. * * @param line - * @return * @throws IOException */ - String processFT(String line) throws IOException + String parseDR(String line) throws IOException { - String[] tokens = line.split(WHITESPACE); - if (tokens.length < 3 || !"CDS".equals(tokens[1])) - { - return nextLine(); - } - - this.cdsLocation = tokens[2]; - - while ((line = nextLine()) != null) + String[] tokens = line.substring(2).split(";"); + if (tokens.length > 1) { - if (!line.startsWith("FT ")) // 4 spaces - { - // e.g. start of next feature "FT source..." - break; - } - /* - * extract qualifier, e.g. FT /protein_id="CAA37824.1" + * ensure UniProtKB/Swiss-Prot converted to UNIPROT */ - int slashPos = line.indexOf('/'); - if (slashPos == -1) - { - Cache.log.error("Unexpected EMBL line ignored: " + line); - continue; - } - int eqPos = line.indexOf('=', slashPos + 1); - if (eqPos == -1) - { - Cache.log.error("Unexpected EMBL line ignored: " + line); - continue; - } - String qualifier = line.substring(slashPos + 1, eqPos); - String value = line.substring(eqPos + 1); - if (value.startsWith("\"") && value.endsWith("\"")) - { - value = value.substring(1, value.length() - 1); - } - - if ("protein_id".equals(qualifier)) - { - proteinId = value; - } - else if ("codon_start".equals(qualifier)) - { - try - { - codonStart = Integer.parseInt(value.trim()); - } catch (NumberFormatException e) - { - Cache.log.error("Invalid codon_start in XML for " + this.accession - + ": " + e.getMessage()); - } - } - else if ("product".equals(qualifier)) - { - // sometimes name is returned e.g. for V00488 - proteinName = value; - } - else if ("translation".equals(qualifier)) - { - line = readTranslation(value); - } - else if (!"".equals(value)) - { - // throw anything else into the additional properties hash - cdsProps.put(qualifier, value); - } - } - - return line; - } - - /** - * Reads and saves the CDS translation from one or more lines of the file, and - * returns the next line after that - * - * @param value - * the first line of the translation (likely quoted) - * @return - * @throws IOException - */ - String readTranslation(String value) throws IOException - { - StringBuilder sb = new StringBuilder(this.length / 3 + 1); - sb.append(value.replace("\"", "")); - - String line; - while ((line = nextLine()) != null) - { - if (!line.startsWith("FT ")) - { - break; // reached next feature or other input line - } - String[] tokens = line.split(WHITESPACE); - if (tokens.length < 2) - { - Cache.log.error("Ignoring bad EMBL line: " + line); - break; - } - if (tokens[1].startsWith("/")) + String db = tokens[0].trim(); + db = DBRefUtils.getCanonicalName(db); + String acc = tokens[1].trim(); + if (acc.endsWith(".")) { - break; // next feature qualifier + acc = acc.substring(0, acc.length() - 1); } - sb.append(tokens[1].replace("\"", "")); - } - - return sb.toString(); - } - - /** - * Processes the parsed CDS feature data to - *
    - *
  • add a CDS feature to the sequence for each CDS start-end range
  • - *
  • create a protein product sequence for the translation
  • - *
  • create a cross-reference to protein with mapping from dna
  • - *
  • add any CDS dbrefs to the sequence and to the protein product
  • - *
- * @param SequenceI dna - */ - void processCDS(SequenceI dna) - { - /* - * parse location into a list of [start, end, start, end] positions - */ - int[] exons = getCdsRanges(this.accession, this.cdsLocation); - int exonNumber = 0; - - for (int xint = 0; exons != null - && xint < exons.length - 1; xint += 2) - { - int exonStart = exons[xint]; - int exonEnd = exons[xint + 1]; - int begin = Math.min(exonStart, exonEnd); - int end = Math.max(exonStart, exonEnd); - exonNumber++; - String desc = String.format("Exon %d for protein EMBLCDS:%s", - exonNumber, proteinId); - - SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); - if (!cdsProps.isEmpty()) + String version = "0"; + if (tokens.length > 2) { - for (Entry val : cdsProps.entrySet()) + String secondaryId = tokens[2].trim(); + if (!secondaryId.isEmpty()) { - sf.setValue(val.getKey(), val.getValue()); + // todo: is this right? secondary id is not a version number + // version = secondaryId; } } - - sf.setEnaLocation(this.cdsLocation); - boolean forwardStrand = exonStart <= exonEnd; - sf.setStrand(forwardStrand ? "+" : "-"); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); - - dna.addSequenceFeature(sf); + this.dbrefs.add(new DBRefEntry(db, version, acc)); } - } - /** - * Constructs and saves the sequence from parsed components - */ - void assembleSequence() - { - String name = this.accession; - if (this.sourceDb != null) - { - name = this.sourceDb + "|" + name; - } - SequenceI seq = new Sequence(name, this.sequenceString); - for (DBRefEntry dbref : this.dbrefs) - { - seq.addDBRef(dbref); - } - - processCDS(seq); - seq.deriveSequence(); - - addSequence(seq); + return nextLine(); } - /** - * Output (print) is not implemented for EMBL flat file format - */ @Override - public String print(SequenceI[] seqs, boolean jvsuffix) + protected boolean isFeatureContinuationLine(String line) { - return null; - } - - /** - * Returns the CDS location as a single array of [start, end, start, end...] - * positions. If on the reverse strand, these will be in descending order. - * - * @param accession - * @param location - * @return - */ - protected int[] getCdsRanges(String accession, String location) - { - if (location == null) - { - return new int[] {}; - } - - try - { - List ranges = DnaUtils.parseLocation(location); - return MappingUtils.listToArray(ranges); - } catch (ParseException e) - { - Cache.log.warn( - String.format("Not parsing inexact CDS location %s in ENA %s", - location, accession)); - return new int[] {}; - } + return line.startsWith("FT "); // 4 spaces } }