From: Jim Procter Date: Thu, 23 Sep 2021 16:27:46 +0000 (+0100) Subject: Merge branch 'develop' into releases/Release_2_11_2_Branch X-Git-Tag: Release_2_11_2_0~32 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=8862de9ceb93444baadee230855fbbf1d0c62351;hp=acd9913f415ff09bdac6739982d2fb4752b46b8e;p=jalview.git Merge branch 'develop' into releases/Release_2_11_2_Branch --- diff --git a/src/jalview/io/FlatFile.java b/src/jalview/io/EMBLLikeFlatFile.java similarity index 91% rename from src/jalview/io/FlatFile.java rename to src/jalview/io/EMBLLikeFlatFile.java index 55fdd37..64943b2 100644 --- a/src/jalview/io/FlatFile.java +++ b/src/jalview/io/EMBLLikeFlatFile.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; @@ -44,7 +45,7 @@ import jalview.util.MappingUtils; * each line is formatted differently in GenBank and EMBL. See * http://www.insdc.org/files/feature_table.html#7.1. */ -public abstract class FlatFile extends AlignFile +public abstract class EMBLLikeFlatFile extends AlignFile { protected static final String LOCATION = "location"; @@ -159,6 +160,13 @@ public abstract class FlatFile extends AlignFile } /* + * when true, interpret the mol_type 'source' feature attribute + * and generate an RNA sequence from the DNA record + */ + protected boolean produceRna=true; + + + /* * values parsed from the data file */ protected String sourceDb; @@ -173,6 +181,8 @@ public abstract class FlatFile extends AlignFile protected List dbrefs; + protected boolean sequenceStringIsRNA=false; + protected String sequenceString; protected Map cds; @@ -184,7 +194,7 @@ public abstract class FlatFile extends AlignFile * @param sourceId * @throws IOException */ - public FlatFile(FileParse fp, String sourceId) throws IOException + public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException { super(false, fp); // don't parse immediately this.sourceDb = sourceId; @@ -199,6 +209,52 @@ public abstract class FlatFile extends AlignFile } /** + * process attributes for 'source' until the next FT feature entry + * only interested in 'mol_type' + * @param tokens + * @return + * @throws IOException + */ + private String parseSourceQualifiers(String[] tokens) throws IOException + { + if (!"source".equals(tokens[0])) + { + throw (new RuntimeException("Not given a 'source' qualifier line")); + } + // search for mol_type attribute + + StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of + // sequence + + String line = parseFeatureQualifier(sb, false); + while (line != null) + { + if (!line.startsWith("FT ")) // four spaces, end of this feature table + // entry + { + return line; + } + + // case sensitive ? + int p = line.indexOf("\\mol_type"); + int qs = line.indexOf("\"", p); + int qe = line.indexOf("\"", qs + 1); + String qualifier=line.substring(qs,qe).toLowerCase(Locale.ROOT); + if (qualifier.indexOf("rna") > -1) + { + sequenceStringIsRNA = true; + } + if (qualifier.indexOf("dna") > -1) + { + sequenceStringIsRNA = false; + } + line=parseFeatureQualifier(sb, false); + } + return line; + } + + + /** * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and * returns the next line * @@ -335,6 +391,12 @@ public abstract class FlatFile extends AlignFile { name = this.sourceDb + "|" + name; } + + if (produceRna && sequenceStringIsRNA) + { + sequenceString = sequenceString.replace('T', 'U').replace('t', 'u'); + } + SequenceI seq = new Sequence(name, this.sequenceString); seq.setDescription(this.description); @@ -738,11 +800,14 @@ public abstract class FlatFile extends AlignFile protected String parseFeature(String line) throws IOException { String[] tokens = line.trim().split(WHITESPACE); - if (tokens.length < 2 || !"CDS".equals(tokens[0])) + if (tokens.length < 2 || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0])))) { return nextLine(); } - + if (tokens[0].equals("source")) + { + return parseSourceQualifiers(tokens); + } return parseCDSFeature(tokens[1]); } } diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java index 19496ef..7808d1a 100644 --- a/src/jalview/io/EmblFlatFile.java +++ b/src/jalview/io/EmblFlatFile.java @@ -25,7 +25,7 @@ import jalview.util.DBRefUtils; * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html */ -public class EmblFlatFile extends FlatFile +public class EmblFlatFile extends EMBLLikeFlatFile { /** * Constructor given a data source and the id of the source database diff --git a/src/jalview/io/GenBankFile.java b/src/jalview/io/GenBankFile.java index ba7b4b4..f1ca0e3 100644 --- a/src/jalview/io/GenBankFile.java +++ b/src/jalview/io/GenBankFile.java @@ -16,7 +16,7 @@ import java.io.IOException; * @author gmcarstairs * @see https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html */ -public class GenBankFile extends FlatFile +public class GenBankFile extends EMBLLikeFlatFile { private static final String DEFINITION = "DEFINITION"; diff --git a/src/jalview/ws/dbsources/EmblFlatfileSource.java b/src/jalview/ws/dbsources/EmblFlatfileSource.java index 7d3c6dd..2058800 100644 --- a/src/jalview/ws/dbsources/EmblFlatfileSource.java +++ b/src/jalview/ws/dbsources/EmblFlatfileSource.java @@ -97,7 +97,6 @@ public abstract class EmblFlatfileSource extends EbiFileRetrievedProxy file = reply.getAbsolutePath(); FileParse fp = new FileParse(file, DataSourceType.FILE); EmblFlatFile emblParser = new EmblFlatFile(fp, getDbSource()); - emblParser.parse(); SequenceI[] seqs = emblParser.getSeqsAsArray(); if (seqs.length > 0) { diff --git a/test/jalview/io/EmblFlatFileTest.java b/test/jalview/io/EmblFlatFileTest.java index ee853f3..7775c8f 100644 --- a/test/jalview/io/EmblFlatFileTest.java +++ b/test/jalview/io/EmblFlatFileTest.java @@ -230,6 +230,22 @@ public class EmblFlatFileTest } assertEquals(uniprotCount, 8); } + /** + * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, + * one of them reverse strand + * + * @throws MalformedURLException + * @throws IOException + */ + @Test(groups = "Functional") + public void testParseToRNA() throws MalformedURLException, IOException + { + File dataFile = new File("test/jalview/io/J03321_rna.embl.txt"); + FileParse fp = new FileParse(dataFile, DataSourceType.FILE); + EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); + List seqs = parser.getSeqs(); + assertTrue(seqs.get(0).getSequenceAsString().indexOf("u")>-1); + } @Test(groups = "Functional") public void testParse_codonStartNot1() diff --git a/test/jalview/io/GenBankFileTest.java b/test/jalview/io/GenBankFileTest.java index 89f0d0e..97e4754 100644 --- a/test/jalview/io/GenBankFileTest.java +++ b/test/jalview/io/GenBankFileTest.java @@ -42,7 +42,7 @@ public class GenBankFileTest File dataFile = new File("test/jalview/io/J03321.gb"); FileParse fp = new FileParse(dataFile.getAbsolutePath(), DataSourceType.FILE); - FlatFile parser = new GenBankFile(fp, "GenBankTest"); + EMBLLikeFlatFile parser = new GenBankFile(fp, "GenBankTest"); List seqs = parser.getSeqs(); assertEquals(seqs.size(), 1);