From 36952139fb50d6ee1d12ecf69f340986ae9bafcc Mon Sep 17 00:00:00 2001
From: Jim Procter
Date: Wed, 17 Feb 2021 17:16:03 +0000
Subject: [PATCH] JAL-3821 patch for 2.11.2 to retrieve RNA ENA records as RNA

---
 src/jalview/io/EmblFlatFile.java |   66 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java
index 900aef8..ff18a34 100644
--- a/src/jalview/io/EmblFlatFile.java
+++ b/src/jalview/io/EmblFlatFile.java
@@ -50,6 +50,11 @@ public class EmblFlatFile extends AlignFile // FileParse
   private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
 
   /**
+   * when true, interpret the mol_type 'source' feature attribute
+   * and generate an RNA sequence from the DNA record
+   */
+  private boolean produceRna=true;
+  /**
    * A data bean class to hold values parsed from one CDS Feature (FT)
    */
   class CdsData
@@ -86,6 +91,7 @@ public class EmblFlatFile extends AlignFile // FileParse
 
   private List dbrefs; // from DR
 
+  private boolean sequenceStringIsRNA=false;
   private String sequenceString; // from SQ lines
 
   /*
@@ -317,11 +323,19 @@ public class EmblFlatFile extends AlignFile // FileParse
   String parseFT(String line) throws IOException
   {
     String[] tokens = line.split(WHITESPACE);
-    if (tokens.length < 3 || !"CDS".equals(tokens[1]))
+    if (tokens.length < 3 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
     {
       return nextLine();
     }
+
+    if (tokens[1].equals("source"))
+    {
+      return parseSourceQualifiers(tokens);
+    }
 
+    /*
+     * parse location - which may be over more than one line e.g. EAW51554
+     */
     CdsData data = new CdsData();
     data.cdsLocation = tokens[2];
     // TODO location can be over >1 line e.g. EAW51554
@@ -418,6 +432,50 @@ public class EmblFlatFile extends AlignFile // FileParse
   }
 
   /**
+   * Processes the qualifiers of a 'source' feature up to the next FT feature
+   * entry; only 'mol_type' is examined, to record whether the record is RNA
+   * @param tokens the whitespace-split 'FT source' line (feature key at [1])
+   * @return the first line read that is not a qualifier of this feature
+   * @throws IOException
+   */
+  private String parseSourceQualifiers(String[] tokens) throws IOException
+  {
+    if (!"source".equals(tokens[1]))
+    {
+      throw (new RuntimeException("Not given a source qualifier"));
+    }
+    // search for mol_type attribute
+
+    StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
+                                                              // sequence
+
+    String line = parseFeatureQualifier(sb, "source");
+    while (line != null)
+    {
+      if (!line.startsWith("FT    ")) // four spaces, end of this feature table
+                                      // entry
+      {
+        return line;
+      }
+      // only a quoted /mol_type value is inspected; other lines are skipped
+      int p = line.indexOf("/mol_type");
+      int qs = p < 0 ? -1 : line.indexOf("\"", p);
+      int qe = qs < 0 ? -1 : line.indexOf("\"", qs + 1);
+      String qualifier = qe < 0 ? "" : line.substring(qs, qe).toLowerCase();
+      if (qualifier.indexOf("rna") > -1)
+      {
+        sequenceStringIsRNA = true;
+      }
+      if (qualifier.indexOf("dna") > -1)
+      {
+        sequenceStringIsRNA = false;
+      }
+      line=parseFeatureQualifier(sb, "source");
+    }
+    return line;
+  }
+
+  /**
    * Removes leading or trailing double quotes (") unless doubled, and changes
    * any 'escaped' (doubled) double quotes to single characters. As per the
    * Feature Table specification for Qualifiers, Free Text.
@@ -524,6 +582,12 @@ public class EmblFlatFile extends AlignFile // FileParse
     {
       name = this.sourceDb + "|" + name;
     }
+
+    if (produceRna && sequenceStringIsRNA)
+    {
+      sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
+    }
+
     SequenceI seq = new Sequence(name, this.sequenceString);
     seq.setDescription(this.description);
 
-- 
1.7.10.2