X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2FEmblFlatFile.java;fp=src%2Fjalview%2Fio%2FEmblFlatFile.java;h=9214f7e7963ab31a967f91d104018e54150865a8;hb=3338d9ab2b7587db8f1899cdf42e0d666dd0f1a8;hp=759fa28302f46557330586541aee5cd557c581b4;hpb=0a37e3b824b46b026916e124b42400590242d145;p=jalview.git diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 759fa28..9214f7e 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -15,6 +15,7 @@ import jalview.datamodel.FeatureProperties; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.util.DBRefUtils; import jalview.util.DnaUtils; import jalview.util.MappingUtils; @@ -39,10 +40,28 @@ import jalview.util.MappingUtils; */ public class EmblFlatFile extends AlignFile // FileParse { + /** + * A data bean class to hold values parsed from one CDS Feature (FT) + */ + class CdsData + { + String translation; // from CDS feature /translation + + String cdsLocation; // CDS /location raw value + + int codonStart = 1; // from CDS /codon_start + + String proteinName; // from CDS /product; TODO: use for protein description + + String proteinId; // from CDS /protein_id + + Map cdsProps = new Hashtable<>(); // CDS other qualifiers + } + private static final String WHITESPACE = "\\s+"; private String sourceDb; - + /* * values parsed from the EMBL flatfile record */ @@ -56,20 +75,11 @@ public class EmblFlatFile extends AlignFile // FileParse private String sequenceString; // from SQ lines - private String translation; // from CDS feature /translation - - private String cdsLocation; // CDS /location raw value - - private int codonStart = 1; // from CDS /codon_start - - private String proteinName; // from CDS /product - - private String proteinId; // from CDS /protein_id - - private Map cdsProps; // CDS other qualifiers e.g. 
'note' - + private List cds; + /** * Constructor + * * @param fp * @param sourceId * @throws IOException @@ -79,7 +89,7 @@ public class EmblFlatFile extends AlignFile // FileParse super(false, fp); // don't parse immediately this.sourceDb = sourceId; dbrefs = new ArrayList<>(); - cdsProps = new Hashtable<>(); + cds = new ArrayList<>(); } /** @@ -95,19 +105,19 @@ public class EmblFlatFile extends AlignFile // FileParse { if (line.startsWith("ID")) { - line = processID(line); + line = parseID(line); } else if (line.startsWith("DR")) { - line = processDR(line); + line = parseDR(line); } else if (line.startsWith("SQ")) { - line = processSQ(); + line = parseSQ(); } else if (line.startsWith("FT")) { - line = processFT(line); + line = parseFT(line); } else { @@ -124,7 +134,7 @@ public class EmblFlatFile extends AlignFile // FileParse * @param line * @throws IOException */ - String processID(String line) throws IOException + String parseID(String line) throws IOException { String[] tokens = line.substring(2).split(";"); @@ -176,18 +186,32 @@ public class EmblFlatFile extends AlignFile // FileParse * @param line * @throws IOException */ - String processDR(String line) throws IOException + String parseDR(String line) throws IOException { String[] tokens = line.substring(2).split(";"); if (tokens.length > 1) { + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ String db = tokens[0].trim(); + db = DBRefUtils.getCanonicalName(db); String acc = tokens[1].trim(); if (acc.endsWith(".")) { acc = acc.substring(0, acc.length() - 1); } - this.dbrefs.add(new DBRefEntry(db, "0", acc)); + String version = "0"; + if (tokens.length > 2) + { + String secondaryId = tokens[2].trim(); + if (!secondaryId.isEmpty()) + { + // todo: is this right? 
secondary id is not a version number + // version = secondaryId; + } + } + this.dbrefs.add(new DBRefEntry(db, version, acc)); } return nextLine(); @@ -201,7 +225,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @throws IOException */ - String processSQ() throws IOException + String parseSQ() throws IOException { StringBuilder sb = new StringBuilder(this.length); String line = nextLine(); @@ -234,7 +258,7 @@ public class EmblFlatFile extends AlignFile // FileParse * @return * @throws IOException */ - String processFT(String line) throws IOException + String parseFT(String line) throws IOException { String[] tokens = line.split(WHITESPACE); if (tokens.length < 3 || !"CDS".equals(tokens[1])) @@ -242,9 +266,11 @@ public class EmblFlatFile extends AlignFile // FileParse return nextLine(); } - this.cdsLocation = tokens[2]; + CdsData data = new CdsData(); + data.cdsLocation = tokens[2]; - while ((line = nextLine()) != null) + line = nextLine(); + while (line != null) { if (!line.startsWith("FT ")) // 4 spaces { @@ -276,48 +302,67 @@ public class EmblFlatFile extends AlignFile // FileParse if ("protein_id".equals(qualifier)) { - proteinId = value; + data.proteinId = value; + line = nextLine(); } else if ("codon_start".equals(qualifier)) { try { - codonStart = Integer.parseInt(value.trim()); + data.codonStart = Integer.parseInt(value.trim()); } catch (NumberFormatException e) { Cache.log.error("Invalid codon_start in XML for " + this.accession + ": " + e.getMessage()); } + line = nextLine(); + } + else if ("db_xref".equals(qualifier)) + { + String[] parts = value.split(":"); + if (parts.length == 2) + { + String db = parts[0].trim(); + db = DBRefUtils.getCanonicalName(db); + DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); + this.dbrefs.add(dbref); + } + line = nextLine(); } else if ("product".equals(qualifier)) { // sometimes name is returned e.g. 
for V00488 - proteinName = value; + data.proteinName = value; + line = nextLine(); } else if ("translation".equals(qualifier)) { - line = readTranslation(value); + line = readTranslation(value, data); } else if (!"".equals(value)) { // throw anything else into the additional properties hash - cdsProps.put(qualifier, value); + data.cdsProps.put(qualifier, value); + line = nextLine(); } } + + this.cds.add(data); return line; } /** - * Reads and saves the CDS translation from one or more lines of the file, and - * returns the next line after that + * Reads and returns the CDS translation from one or more lines of the file, + * and returns the next line after that * * @param value * the first line of the translation (likely quoted) + * @param data * @return * @throws IOException */ - String readTranslation(String value) throws IOException + String readTranslation(String value, CdsData data) throws IOException { StringBuilder sb = new StringBuilder(this.length / 3 + 1); sb.append(value.replace("\"", "")); @@ -342,7 +387,9 @@ public class EmblFlatFile extends AlignFile // FileParse sb.append(tokens[1].replace("\"", "")); } - return sb.toString(); + data.translation = sb.toString(); + + return line; } /** @@ -351,20 +398,21 @@ public class EmblFlatFile extends AlignFile // FileParse *
 <li>add a CDS feature to the sequence for each CDS start-end range</li>
 * <li>create a protein product sequence for the translation</li>
 * <li>create a cross-reference to protein with mapping from dna</li>
- * <li>add any CDS dbrefs to the sequence and to the protein product</li>
+ * <li>add any CDS dbrefs to the sequence and to the protein product</li>
  • * - * @param SequenceI dna + * + * @param SequenceI + * dna */ - void processCDS(SequenceI dna) + void processCDS(SequenceI dna, CdsData data) { /* * parse location into a list of [start, end, start, end] positions */ - int[] exons = getCdsRanges(this.accession, this.cdsLocation); + int[] exons = getCdsRanges(this.accession, data.cdsLocation); int exonNumber = 0; - - for (int xint = 0; exons != null - && xint < exons.length - 1; xint += 2) + + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { int exonStart = exons[xint]; int exonEnd = exons[xint + 1]; @@ -372,29 +420,37 @@ public class EmblFlatFile extends AlignFile // FileParse int end = Math.max(exonStart, exonEnd); exonNumber++; String desc = String.format("Exon %d for protein EMBLCDS:%s", - exonNumber, proteinId); + exonNumber, data.proteinId); - SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, this.sourceDb); - if (!cdsProps.isEmpty()) + SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, + this.sourceDb); + for (Entry val : data.cdsProps.entrySet()) { - for (Entry val : cdsProps.entrySet()) - { - sf.setValue(val.getKey(), val.getValue()); - } + sf.setValue(val.getKey(), val.getValue()); } - sf.setEnaLocation(this.cdsLocation); + sf.setEnaLocation(data.cdsLocation); boolean forwardStrand = exonStart <= exonEnd; sf.setStrand(forwardStrand ? 
"+" : "-"); - sf.setPhase(String.valueOf(codonStart - 1)); + sf.setPhase(String.valueOf(data.codonStart - 1)); sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); dna.addSequenceFeature(sf); } } /** + * Constructs a sequence for the protein product (if there is one), and dbrefs + * with mappings from dna to protein and the reverse + */ + void processTranslation() + { + // TODO Auto-generated method stub + + } + + /** * Constructs and saves the sequence from parsed components */ void assembleSequence() @@ -409,10 +465,16 @@ public class EmblFlatFile extends AlignFile // FileParse { seq.addDBRef(dbref); } - - processCDS(seq); + + for (CdsData data : cds) + { + processCDS(seq, data); + }; + + processTranslation(); + seq.deriveSequence(); - + addSequence(seq); }