From d4ed63b811b2b57149af11ba0ccbe11c6845acba Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 24 Jul 2020 09:38:45 +0100 Subject: [PATCH] JAL-3692 parse DE for description, and other refactoring... --- src/jalview/io/EmblFlatFile.java | 186 ++++++++++++++++++++++++--------- test/jalview/io/EmblFlatFileTest.java | 24 +++-- 2 files changed, 154 insertions(+), 56 deletions(-) diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 9214f7e..5be4364 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -3,6 +3,7 @@ package jalview.io; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; +import java.util.HashMap; import java.util.Hashtable; import java.util.List; import java.util.Map; @@ -10,8 +11,8 @@ import java.util.Map.Entry; import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; +import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -51,11 +52,11 @@ public class EmblFlatFile extends AlignFile // FileParse int codonStart = 1; // from CDS /codon_start - String proteinName; // from CDS /product; TODO: use for protein description + String proteinName; // from CDS /product; used for protein description String proteinId; // from CDS /protein_id - Map cdsProps = new Hashtable<>(); // CDS other qualifiers + Map cdsProps = new Hashtable<>(); // CDS other qualifiers } private static final String WHITESPACE = "\\s+"; @@ -69,6 +70,8 @@ public class EmblFlatFile extends AlignFile // FileParse private String version; // from ID (second token) + private String description; // from (first) DE line + private int length = 128; // from ID (7th token), with usable default private List dbrefs; // from DR and also CDS /db_xref qualifiers @@ -76,7 +79,7 @@ public class EmblFlatFile extends AlignFile // FileParse private String sequenceString; // from SQ lines private List cds; - + /** * Constructor * @@ -107,6 +110,10 @@ public class EmblFlatFile extends AlignFile // FileParse { line = parseID(line); } + else if (line.startsWith("DE")) + { + line = parseDE(line); + } else if (line.startsWith("DR")) { line = parseDR(line); @@ -180,6 +187,38 @@ public class EmblFlatFile extends AlignFile // FileParse } /** + * Reads sequence description from the first DE line found. Any trailing + * period is discarded. If there are multiple DE lines, only the first (short + * description) is read, the rest are ignored. + * + * @param line + * @return + * @throws IOException + */ + String parseDE(String line) throws IOException + { + String desc = line.substring(2).trim(); + if (desc.endsWith(".")) + { + desc = desc.substring(0, desc.length() - 1); + } + this.description = desc; + + /* + * pass over any additional DE lines + */ + while ((line = nextLine()) != null) + { + if (!line.startsWith("DE")) + { + break; + } + } + + return line; + } + + /** * Processes one DR line and saves as a DBRefEntry cross-reference. Returns * the line following the line processed. * @@ -208,7 +247,7 @@ public class EmblFlatFile extends AlignFile // FileParse if (!secondaryId.isEmpty()) { // todo: is this right? secondary id is not a version number - // version = secondaryId; + // version = secondaryId; } } this.dbrefs.add(new DBRefEntry(db, version, acc)); @@ -270,7 +309,7 @@ public class EmblFlatFile extends AlignFile // FileParse data.cdsLocation = tokens[2]; line = nextLine(); - while (line != null) + while (line != null) { if (!line.startsWith("FT ")) // 4 spaces { @@ -348,7 +387,7 @@ public class EmblFlatFile extends AlignFile // FileParse } this.cds.add(data); - + return line; } @@ -358,7 +397,7 @@ public class EmblFlatFile extends AlignFile // FileParse * * @param value * the first line of the translation (likely quoted) - * @param data + * @param data * @return * @throws IOException */ @@ -388,11 +427,63 @@ public class EmblFlatFile extends AlignFile // FileParse } data.translation = sb.toString(); - + return line; } /** + * Constructs and saves the sequence from parsed components + */ + void assembleSequence() + { + String name = this.accession; + if (this.sourceDb != null) + { + name = this.sourceDb + "|" + name; + } + SequenceI seq = new Sequence(name, this.sequenceString); + seq.setDescription(this.description); + + /* + * add a DBRef to itself + */ + DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession); + int[] startEnd = new int[] { 1, seq.getLength() }; + selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1)); + seq.addDBRef(selfRef); + + for (DBRefEntry dbref : this.dbrefs) + { + seq.addDBRef(dbref); + } + + processAllCDS(seq); + + seq.deriveSequence(); + + addSequence(seq); + } + + /** + * Process the CDS features, including generation of cross-references and + * mappings to the protein products (translation) + * + * @param seq + */ + protected void processAllCDS(SequenceI seq) + { + /* + * record protein products found to avoid duplication i.e. >1 CDS with + * the same /protein_id [though not sure I can find an example of this] + */ + Map proteins = new HashMap<>(); + for (CdsData data : cds) + { + processOneCDS(seq, data, proteins); + } + } + + /** * Processes the parsed CDS feature data to *
    *
  • add a CDS feature to the sequence for each CDS start-end range
  • @@ -403,8 +494,11 @@ public class EmblFlatFile extends AlignFile // FileParse * * @param SequenceI * dna + * @param proteins + * map of protein products so far derived from CDS data */ - void processCDS(SequenceI dna, CdsData data) + void processOneCDS(SequenceI dna, CdsData data, + Map proteins) { /* * parse location into a list of [start, end, start, end] positions @@ -437,54 +531,41 @@ public class EmblFlatFile extends AlignFile // FileParse sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); dna.addSequenceFeature(sf); - } - } - - /** - * Constructs a sequence for the protein product (if there is one), and dbrefs - * with mappings from dna to protein and the reverse - */ - void processTranslation() - { - // TODO Auto-generated method stub + linkProteinProduct(dna, data, proteins); + } } /** - * Constructs and saves the sequence from parsed components + * Constructs a sequence for the protein product for the CDS data (if there is + * one), and dbrefs with mappings from CDS to protein and the reverse + * + * @param dna + * @param data + * @param proteins */ - void assembleSequence() + void linkProteinProduct(SequenceI dna, CdsData data, Map proteins) { - String name = this.accession; - if (this.sourceDb != null) + /* + * check we have some data to work with + */ + if (data.proteinId == null || data.translation == null) { - name = this.sourceDb + "|" + name; + return; } - SequenceI seq = new Sequence(name, this.sequenceString); - for (DBRefEntry dbref : this.dbrefs) + + /* + * Construct the protein sequence (if not already seen) + */ + SequenceI protein = proteins.get(data.proteinId); + if (protein == null) { - seq.addDBRef(dbref); + protein = new Sequence(data.proteinId, data.translation, 1, + data.translation.length()); + protein.setDescription(data.proteinName != null ? data.proteinName + : "Protein Product from " + sourceDb); + proteins.put(data.proteinId, protein); } - - for (CdsData data : cds) - { - processCDS(seq, data); - }; - - processTranslation(); - - seq.deriveSequence(); - - addSequence(seq); - } - - /** - * Output (print) is not implemented for EMBL flat file format - */ - @Override - public String print(SequenceI[] seqs, boolean jvsuffix) - { - return null; } /** @@ -514,4 +595,13 @@ public class EmblFlatFile extends AlignFile // FileParse return new int[] {}; } } + + /** + * Output (print) is not implemented for EMBL flat file format + */ + @Override + public String print(SequenceI[] seqs, boolean jvsuffix) + { + return null; + } } diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index 6d9874e..b1023d1 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -2,7 +2,6 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; -import static org.testng.Assert.assertNull; import java.io.File; import java.io.IOException; @@ -13,6 +12,7 @@ import java.util.Set; import org.testng.annotations.Test; import jalview.datamodel.DBRefEntry; +import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.features.SequenceFeatures; @@ -39,6 +39,7 @@ public class EmblFlatFileTest SequenceI seq = seqs.get(0); assertEquals(seq.getName(), "EmblTest|J03321"); assertEquals(seq.getLength(), 7502); + assertEquals(seq.getDescription(), "Chlamydia trachomatis plasmid pCHL1, complete sequence"); /* * should be 9 CDS features (one is a 'join' of two exons) @@ -48,7 +49,7 @@ public class EmblFlatFileTest assertTrue(featureTypes.contains("CDS")); /* - * inspect some features (sort them for convenience of test assertions) + * inspect some features (sorted just for convenience of test assertions) */ List features = seq.getFeatures() .getAllFeatures("CDS"); @@ -65,7 +66,7 @@ public class EmblFlatFileTest assertEquals(sf.getPhase(), "0"); assertEquals(sf.getStrand(), 1); assertEquals(sf.getValue("note"), "pGP7-D"); - // second exon of circular DNA! + // this is the second exon of circular CDS! assertEquals(sf.getValue("exon number"), 2); assertEquals(sf.getValue("product"), "hypothetical protein"); assertEquals(sf.getValue("transl_table"), "11"); @@ -97,7 +98,7 @@ public class EmblFlatFileTest assertEquals(sf.getValue("product"), "hypothetical protein"); /* - * CDS at 7022-7502 is the first exon of the circular DNA CDS + * CDS at 7022-7502 is the first exon of the circular CDS */ sf = features.get(8); assertEquals(sf.getBegin(), 7022); @@ -113,18 +114,25 @@ public class EmblFlatFileTest assertEquals(sf.getValue("product"), "hypothetical protein"); /* - * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries, - * some of them (e.g. INTERPRO) duplicates; sample a few here + * Jalview adds a dbref to 'self', and there are 4 'direct' (DR) dbrefs, + * and numerous CDS /db_xref entries (some e.g. INTERPRO are duplicates) + * sample a few here * Note DBRefEntry constructor capitalises source */ List dbrefs = seq.getDBRefs(); - assertEquals(dbrefs.size(), 31); + assertEquals(dbrefs.size(), 32); + // xref to 'self': + DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321"); + int[] range = new int[] {1, seq.getLength()}; + selfRef.setMap(new Mapping(null, range, range, 1, 1)); + assertTrue(dbrefs.contains(selfRef)); + // 1st DR line; note trailing period is removed assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0", "d4c4942a634e3df4995fd5ac75c26a61"))); // the 4th DR line: assertTrue( - dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941"))); + dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941"))); // from the first CDS feature; note canonicalisation to "UNIPROT" assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19"))); assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19"))); -- 1.7.10.2