From bb9e5fdd698bc2ccfbe37293f4d25178c407c811 Mon Sep 17 00:00:00 2001 From: tcofoegbu Date: Mon, 2 Nov 2015 14:03:46 +0000 Subject: [PATCH] JAL-1955 Uniprot sequence fetcher refactor --- src/jalview/ws/dbsources/Uniprot.java | 175 +++++++++++++--------------- test/jalview/ws/dbsources/UniprotTest.java | 12 +- 2 files changed, 88 insertions(+), 99 deletions(-) diff --git a/src/jalview/ws/dbsources/Uniprot.java b/src/jalview/ws/dbsources/Uniprot.java index 1e8eadb..843828b 100644 --- a/src/jalview/ws/dbsources/Uniprot.java +++ b/src/jalview/ws/dbsources/Uniprot.java @@ -20,10 +20,12 @@ */ package jalview.ws.dbsources; +import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.PDBEntry; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.datamodel.UniprotEntry; @@ -35,6 +37,7 @@ import jalview.ws.seqfetcher.DbSourceProxyImpl; import java.io.File; import java.io.FileReader; import java.io.Reader; +import java.util.ArrayList; import java.util.Vector; import org.exolab.castor.xml.Unmarshaller; @@ -50,8 +53,6 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy private static final String BAR_DELIMITER = "|"; - private static final String NEWLINE = "\n"; - private static org.exolab.castor.mapping.Mapping map; /** @@ -62,7 +63,6 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy super(); addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB); addDbSourceProperty(DBRefSource.PROTSEQDB); - // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50)); } /* @@ -70,9 +70,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionSeparator() */ + @Override public String getAccessionSeparator() { - return null; // ";"; + return null; } /* @@ -80,6 +81,7 @@ public class Uniprot 
extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getAccessionValidator() */ + @Override public Regex getAccessionValidator() { return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)"); @@ -90,6 +92,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbSource() */ + @Override public String getDbSource() { return DBRefSource.UNIPROT; @@ -100,6 +103,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getDbVersion() */ + @Override public String getDbVersion() { return "0"; // we really don't know what version we're on. @@ -148,6 +152,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[]) */ + @Override public AlignmentI getSequenceRecords(String queries) throws Exception { startQuery(); @@ -165,32 +170,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy if (entries != null) { - /* - * If Castor binding included sequence@length, we could guesstimate the - * size of buffer to hold the alignment - */ - StringBuffer result = new StringBuffer(128); - // First, make the new sequences + ArrayList seqs = new ArrayList(); for (UniprotEntry entry : entries) { - StringBuilder name = constructSequenceFastaHeader(entry); - - result.append(name).append(NEWLINE) - .append(entry.getUniprotSequence().getContent()) - .append(NEWLINE); + seqs.add(uniprotEntryToSequenceI(entry)); } + al = new Alignment(seqs.toArray(new SequenceI[0])); - // Then read in the features and apply them to the dataset - al = parseResult(result.toString()); - if (al != null) - { - // Decorate the alignment with database entries. 
- addUniprotXrefs(al, entries); - } - else - { - results = result; - } } stopQuery(); return al; @@ -202,99 +188,95 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy } /** - * Construct a Fasta-format sequence header by concatenating the source, - * accession id(s) and name(s), delimited by '|', plus any protein names, now - * with space rather than bar delimiter * * @param entry - * @return + * UniprotEntry + * @return SequenceI instance created from the UniprotEntry instance */ - public static StringBuilder constructSequenceFastaHeader( - UniprotEntry entry) - { - StringBuilder name = new StringBuilder(32); - name.append(">UniProt/Swiss-Prot"); + public SequenceI uniprotEntryToSequenceI(UniprotEntry entry){ + String id = getUniprotEntryId(entry); + SequenceI sequence = new Sequence(id, entry.getUniprotSequence() + .getContent()); + sequence.setDescription(getUniprotEntryDescription(entry)); + + final String dbVersion = getDbVersion(); + ArrayList dbRefs = new ArrayList(); for (String accessionId : entry.getAccession()) { - name.append(BAR_DELIMITER); - name.append(accessionId); + DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion, + accessionId); + dbRefs.add(dbRef); } - for (String n : entry.getName()) + sequence.setSourceDBRef((dbRefs != null && dbRefs.size() > 0) ? 
dbRefs + .get(0) : null); + + Vector onlyPdbEntries = new Vector(); + for (PDBEntry pdb : entry.getDbReference()) { - name.append(BAR_DELIMITER); - name.append(n); + DBRefEntry dbr = new DBRefEntry(); + dbr.setSource(pdb.getType()); + dbr.setAccessionId(pdb.getId()); + dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); + dbRefs.add(dbr); + if ("PDB".equals(pdb.getType())) + { + onlyPdbEntries.addElement(pdb); + } } - if (entry.getProtein() != null && entry.getProtein().getName() != null) + sequence.setPDBId(onlyPdbEntries); + if (entry.getFeature() != null) { - for (String nm : entry.getProtein().getName()) + for (SequenceFeature sf : entry.getFeature()) { - name.append(" ").append(nm); + sf.setFeatureGroup("Uniprot"); + sequence.addSequenceFeature(sf); } } - return name; + sequence.setDBRefs(dbRefs.toArray(new DBRefEntry[0])); + return sequence; } /** - * add an ordered set of UniprotEntry objects to an ordered set of seuqences. * - * @param al - * - a sequence of n sequences - * @param entries - * a list of n uniprot entries to be analysed. 
+ * @param entry + * UniprotEntry + * @return protein name(s) delimited by a white space character */ - public void addUniprotXrefs(AlignmentI al, Vector entries) + public static String getUniprotEntryDescription(UniprotEntry entry) { - final String dbVersion = getDbVersion(); - - for (int i = 0; i < entries.size(); i++) + StringBuilder desc = new StringBuilder(32); + if (entry.getProtein() != null && entry.getProtein().getName() != null) { - UniprotEntry entry = entries.elementAt(i); - Vector onlyPdbEntries = new Vector(); - Vector dbxrefs = new Vector(); - - for (PDBEntry pdb : entry.getDbReference()) - { - DBRefEntry dbr = new DBRefEntry(); - dbr.setSource(pdb.getType()); - dbr.setAccessionId(pdb.getId()); - dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion); - dbxrefs.addElement(dbr); - if ("PDB".equals(pdb.getType())) - { - onlyPdbEntries.addElement(pdb); - } - } - - SequenceI sq = al.getSequenceAt(i); - while (sq.getDatasetSequence() != null) - { - sq = sq.getDatasetSequence(); - } - - for (String accessionId : entry.getAccession()) + for (String nm : entry.getProtein().getName()) { - /* - * add as uniprot whether retrieved from uniprot or uniprot_name - */ - sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion, - accessionId)); + desc.append(nm).append(" "); } + } + return desc.toString(); + } - for (DBRefEntry dbRef : dbxrefs) - { - sq.addDBRef(dbRef); - } - sq.setPDBId(onlyPdbEntries); - if (entry.getFeature() != null) - { - for (SequenceFeature sf : entry.getFeature()) - { - sf.setFeatureGroup("Uniprot"); - sq.addSequenceFeature(sf); - } - } + /** + * + * @param entry + * UniprotEntry + * @return The accession id(s) and name(s) delimited by '|'. 
+ */ + public static String getUniprotEntryId(UniprotEntry entry) + { + StringBuilder name = new StringBuilder(32); + name.append("UniProt/Swiss-Prot"); + for (String accessionId : entry.getAccession()) + { + name.append(BAR_DELIMITER); + name.append(accessionId); } + for (String n : entry.getName()) + { + name.append(BAR_DELIMITER); + name.append(n); + } + return name.toString(); } /* @@ -302,6 +284,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy * * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String) */ + @Override public boolean isValidReference(String accession) { // TODO: make the following a standard validator @@ -312,11 +295,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy /** * return LDHA_CHICK uniprot entry */ + @Override public String getTestQuery() { return "P00340"; } + @Override public String getDbName() { return "Uniprot"; // getDbSource(); diff --git a/test/jalview/ws/dbsources/UniprotTest.java b/test/jalview/ws/dbsources/UniprotTest.java index a92b5c4..7e387bd 100644 --- a/test/jalview/ws/dbsources/UniprotTest.java +++ b/test/jalview/ws/dbsources/UniprotTest.java @@ -135,9 +135,13 @@ public class UniprotTest Vector entries = u.getUniprotEntries(reader); UniprotEntry entry = entries.get(0); - // source + accession ids + names + protein names - String expectedName = ">UniProt/Swiss-Prot|A9CKP4|A9CKP5|A9CKP4_AGRT5|A9CKP4_AGRT6 Mitogen-activated protein kinase 13 Henry"; - assertEquals(expectedName, Uniprot.constructSequenceFastaHeader(entry) - .toString()); + // source + accession ids + names + String expectedName = "UniProt/Swiss-Prot|A9CKP4|A9CKP5|A9CKP4_AGRT5|A9CKP4_AGRT6"; + // protein names + String expectedDescription = "Mitogen-activated protein kinase 13 Henry "; + + assertEquals(expectedName, Uniprot.getUniprotEntryId(entry)); + assertEquals(expectedDescription, + Uniprot.getUniprotEntryDescription(entry)); } } -- 1.7.10.2