From 56d72101b0584635cf539d5413db27abc8deb575 Mon Sep 17 00:00:00 2001 From: James Procter Date: Wed, 15 Nov 2023 14:31:08 +0000 Subject: [PATCH] JAL-4090 JAL-4334 position status when start, end or pos is unknown is stored in start_status,end_status,pos_status properties --- src/jalview/ws/dbsources/Uniprot.java | 58 ++- test/jalview/ws/dbsources/UniprotTest.java | 610 ++++++++++++++-------------- 2 files changed, 365 insertions(+), 303 deletions(-) diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index c9db7f2..b125ba5 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -39,6 +39,7 @@ import javax.xml.stream.XMLStreamReader; import com.stevesoft.pat.Regex; import jalview.bin.Cache; +import jalview.bin.Console; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; @@ -292,19 +293,68 @@ public class Uniprot extends DbSourceProxyImpl LocationType location = uf.getLocation(); int start = 0; int end = 0; + String uncertain_start = null, uncertain_end = null, + uncertain_pos = null; if (location.getPosition() != null) { - start = location.getPosition().getPosition().intValue(); - end = start; + if (location.getPosition().getPosition() == null + || !"unknown".equals(location.getPosition().getStatus())) + { + Console.warn( + "Ignoring single position feature with uncertain location " + + uf.getType() + ":" + getDescription(uf)); + uncertain_pos = location.getPosition().getStatus() == null + ? "unknown" + : location.getPosition().getStatus(); + } + else + { + start = location.getPosition().getPosition().intValue(); + end = start; + } } else { - start = location.getBegin().getPosition().intValue(); - end = location.getEnd().getPosition().intValue(); + if (location.getBegin().getPosition() == null) + { + Console.warn( + "Setting start position of feature with uncertain start to 1: " + + uf.getType() + ":" + getDescription(uf)); + start = sequence.getStart(); + uncertain_start = location.getBegin().getStatus(); + } + else + { + start = location.getBegin().getPosition().intValue(); + } + if (location.getEnd().getPosition() == null) + { + Console.warn( + "Setting start position of feature with uncertain start to 1: " + + uf.getType() + ":" + getDescription(uf)); + end = sequence.getEnd(); + uncertain_end = location.getEnd().getStatus(); + } + else + { + end = location.getEnd().getPosition().intValue(); + } } SequenceFeature sf = new SequenceFeature(uf.getType(), getDescription(uf), start, end, "Uniprot"); sf.setStatus(uf.getStatus()); + if (uncertain_end != null) + { + sf.setValue("end_status", uncertain_end); + } + if (uncertain_start != null) + { + sf.setValue("start_status", uncertain_start); + } + if (uncertain_pos != null) + { + sf.setValue("pos_status", uncertain_pos); + } sequence.addSequenceFeature(sf); } } diff --git a/test/jalview/ws/dbsources/UniprotTest.java b/test/jalview/ws/dbsources/UniprotTest.java index 48f1f43..f2d3b66 100644 --- a/test/jalview/ws/dbsources/UniprotTest.java +++ b/test/jalview/ws/dbsources/UniprotTest.java @@ -39,6 +39,7 @@ import org.testng.annotations.Test; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; +import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.gui.JvOptionPane; import jalview.util.DBRefUtils; @@ -326,311 +327,304 @@ public class UniprotTest assertEquals("Hello", Uniprot.getDescription(ft)); } + public static String Q29079 = Q29079 = new String( + "\n" + + "\n" + + "Q29079\n" + + "Q29017\n" + + "PAG2_PIG\n" + "\n" + + "\n" + + "Pregnancy-associated glycoprotein 2\n" + + "PAG 2\n" + + "3.4.23.-\n" + + "\n" + "\n" + "\n" + + "PAG2\n" + "\n" + + "\n" + + "Sus scrofa\n" + + "Pig\n" + + "\n" + + "\n" + "Eukaryota\n" + + "Metazoa\n" + "Chordata\n" + + "Craniata\n" + + "Vertebrata\n" + + "Euteleostomi\n" + + "Mammalia\n" + + "Eutheria\n" + + "Laurasiatheria\n" + + "Artiodactyla\n" + + "Suina\n" + "Suidae\n" + + "Sus\n" + "\n" + + "\n" + "\n" + + "\n" + + "Porcine pregnancy-associated glycoproteins: new members of the aspartic proteinase gene family expressed in trophectoderm.\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + + "NUCLEOTIDE SEQUENCE [GENOMIC DNA]\n" + + "\n" + "\n" + + "\n" + + "Gene for porcine pregnancy-associated glycoprotein 2 (poPAG2): its structural organization and analysis of its promoter.\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + + "NUCLEOTIDE SEQUENCE [GENOMIC DNA]\n" + + "\n" + "Placenta\n" + + "\n" + "\n" + + "\n" + + "\n" + + "Secreted\n" + + "Extracellular space\n" + + "\n" + "\n" + + "\n" + + "Expressed throughout the chorion, with the signal localized exclusively over the trophectoderm.\n" + + "\n" + + "\n" + + "Expression was detected at day 15, coinciding with the beginning of implantation, and continued throughout gestation.\n" + + "\n" + "\n" + + "Belongs to the peptidase A1 family.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "Aspartyl protease\n" + + "Disulfide bond\n" + + "Glycoprotein\n" + + "Hydrolase\n" + + "Protease\n" + + "Reference proteome\n" + + "Secreted\n" + + "Signal\n" + + "Zymogen\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + + "\n" + + "MKWLVILGLVALSDCLVMIPLTKVKSVRESLREKGLLKNFLKEHPYNMIQNLLSKNSSHVQKFSYQPLRNYLDMVYVGNISIGTPPQQFSVVFDTGSSDLWVPSIYCKSKACVTHRSFNPSHSSTFHDRGKSIKLEYGSGKMSGFLGQDTVRIGQLTSTGQAFGLSKEETGKAFEHAIFDGILGLAYPSIAIKGTTTVIDNLKKQDQISEPVFAFYLSSDKEEGSVVMFGGVDKKYYKGDLKWVPLTQTSYWQIALDRITCRGRVIGCPRGCQAIVDTGTSMLHGPSKAVAKIHSLIKHFEKEYVVPCNARKALPDIVFTINNVDYPVPAQAYIRKYVVPCNARKALPDIVFTINNVDYPVPAQAYIRKNANNNRCYSTFEDIMDTLNQREIWILGDVFLRLYFTVYDEGQNRIGLAQAT\n" + + "\n" + + " Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms Distributed under the Creative Commons Attribution (CC BY 4.0) License \n" + + ""); + @DataProvider public Object[][] problemEntries() { - return new Object[][] { - new Object[] - { new String( - "\n" - + "\n" - + "Q29079\n" - + "Q29017\n" - + "PAG2_PIG\n" + "\n" - + "\n" - + "Pregnancy-associated glycoprotein 2\n" - + "PAG 2\n" - + "3.4.23.-\n" - + "\n" + "\n" - + "\n" - + "PAG2\n" - + "\n" + "\n" - + "Sus scrofa\n" - + "Pig\n" - + "\n" - + "\n" + "Eukaryota\n" - + "Metazoa\n" - + "Chordata\n" - + "Craniata\n" - + "Vertebrata\n" - + "Euteleostomi\n" - + "Mammalia\n" - + "Eutheria\n" - + "Laurasiatheria\n" - + "Artiodactyla\n" - + "Suina\n" - + "Suidae\n" + "Sus\n" - + "\n" + "\n" - + "\n" - + "\n" - + "Porcine pregnancy-associated glycoproteins: new members of the aspartic proteinase gene family expressed in trophectoderm.\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "NUCLEOTIDE SEQUENCE [GENOMIC DNA]\n" - + "\n" + "\n" - + "\n" - + "Gene for porcine pregnancy-associated glycoprotein 2 (poPAG2): its structural organization and analysis of its promoter.\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "NUCLEOTIDE SEQUENCE [GENOMIC DNA]\n" - + "\n" + "Placenta\n" - + "\n" + "\n" - + "\n" - + "\n" - + "Secreted\n" - + "Extracellular space\n" - + "\n" + "\n" - + "\n" - + "Expressed throughout the chorion, with the signal localized exclusively over the trophectoderm.\n" - + "\n" - + "\n" - + "Expression was detected at day 15, coinciding with the beginning of implantation, and continued throughout gestation.\n" - + "\n" + "\n" - + "Belongs to the peptidase A1 family.\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "Aspartyl protease\n" - + "Disulfide bond\n" - + "Glycoprotein\n" - + "Hydrolase\n" - + "Protease\n" - + "Reference proteome\n" - + "Secreted\n" - + "Signal\n" - + "Zymogen\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" - + "\n" - + "\n" - + "\n" + "\n" - + "\n" - + "MKWLVILGLVALSDCLVMIPLTKVKSVRESLREKGLLKNFLKEHPYNMIQNLLSKNSSHVQKFSYQPLRNYLDMVYVGNISIGTPPQQFSVVFDTGSSDLWVPSIYCKSKACVTHRSFNPSHSSTFHDRGKSIKLEYGSGKMSGFLGQDTVRIGQLTSTGQAFGLSKEETGKAFEHAIFDGILGLAYPSIAIKGTTTVIDNLKKQDQISEPVFAFYLSSDKEEGSVVMFGGVDKKYYKGDLKWVPLTQTSYWQIALDRITCRGRVIGCPRGCQAIVDTGTSMLHGPSKAVAKIHSLIKHFEKEYVVPCNARKALPDIVFTINNVDYPVPAQAYIRKYVVPCNARKALPDIVFTINNVDYPVPAQAYIRKNANNNRCYSTFEDIMDTLNQREIWILGDVFLRLYFTVYDEGQNRIGLAQAT\n" - + "\n" - + " Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms Distributed under the Creative Commons Attribution (CC BY 4.0) License \n" - + "") } }; + return new Object[][] { new Object[] { Q29079 } }; } @Test(groups = "Functional", dataProvider = "problemEntries") - public void testimportOfProblemEntries(String entry) + public SequenceI testimportOfProblemEntries(String entry) { Uniprot u = new Uniprot(); InputStream is = new ByteArrayInputStream(entry.getBytes()); @@ -638,5 +632,23 @@ public class UniprotTest assertEquals(1, entries.size()); SequenceI sq = u.uniprotEntryToSequence(entries.get(0)); assertNotNull(sq); + return sq; + } + + @Test(groups = "Functional") + public void checkIndefiniteSequenceFeatures() + { + SequenceI upseq = testimportOfProblemEntries(Q29079); + List sf = upseq.getFeatures() + .getPositionalFeatures("chain"); + assertNotNull(sf); + assertTrue(sf.size() == 1); + SequenceFeature chainFeaure = sf.get(0); + assertTrue(chainFeaure.getBegin() == 1); + assertTrue(chainFeaure.getEnd() == upseq.getEnd()); + assertNotNull(chainFeaure.getValueAsString("start_status")); + assertNull(chainFeaure.getValueAsString("end_status")); + assertTrue( + "unknown".equals(chainFeaure.getValueAsString("start_status"))); } } -- 1.7.10.2