JAL-1955 Uniprot sequence fetcher refactor
author tcofoegbu <tcnofoegbu@dundee.ac.uk>
Mon, 2 Nov 2015 14:03:46 +0000 (14:03 +0000)
committer tcofoegbu <tcnofoegbu@dundee.ac.uk>
Mon, 2 Nov 2015 14:03:46 +0000 (14:03 +0000)
src/jalview/ws/dbsources/Uniprot.java
test/jalview/ws/dbsources/UniprotTest.java

index 1e8eadb..843828b 100644 (file)
  */
 package jalview.ws.dbsources;
 
+import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.DBRefEntry;
 import jalview.datamodel.DBRefSource;
 import jalview.datamodel.PDBEntry;
+import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.UniprotEntry;
@@ -35,6 +37,7 @@ import jalview.ws.seqfetcher.DbSourceProxyImpl;
 import java.io.File;
 import java.io.FileReader;
 import java.io.Reader;
+import java.util.ArrayList;
 import java.util.Vector;
 
 import org.exolab.castor.xml.Unmarshaller;
@@ -50,8 +53,6 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
 
   private static final String BAR_DELIMITER = "|";
 
-  private static final String NEWLINE = "\n";
-
   private static org.exolab.castor.mapping.Mapping map;
 
   /**
@@ -62,7 +63,6 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
     super();
     addDbSourceProperty(DBRefSource.SEQDB, DBRefSource.SEQDB);
     addDbSourceProperty(DBRefSource.PROTSEQDB);
-    // addDbSourceProperty(DBRefSource.MULTIACC, new Integer(50));
   }
 
   /*
@@ -70,9 +70,10 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#getAccessionSeparator()
    */
+  @Override
   public String getAccessionSeparator()
   {
-    return null; // ";";
+    return null;
   }
 
   /*
@@ -80,6 +81,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#getAccessionValidator()
    */
+  @Override
   public Regex getAccessionValidator()
   {
     return new Regex("([A-Z]+[0-9]+[A-Z0-9]+|[A-Z0-9]+_[A-Z0-9]+)");
@@ -90,6 +92,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#getDbSource()
    */
+  @Override
   public String getDbSource()
   {
     return DBRefSource.UNIPROT;
@@ -100,6 +103,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#getDbVersion()
    */
+  @Override
   public String getDbVersion()
   {
     return "0"; // we really don't know what version we're on.
@@ -148,6 +152,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#getSequenceRecords(java.lang.String[])
    */
+  @Override
   public AlignmentI getSequenceRecords(String queries) throws Exception
   {
     startQuery();
@@ -165,32 +170,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
 
       if (entries != null)
       {
-        /*
-         * If Castor binding included sequence@length, we could guesstimate the
-         * size of buffer to hold the alignment
-         */
-        StringBuffer result = new StringBuffer(128);
-        // First, make the new sequences
+        ArrayList<SequenceI> seqs = new ArrayList<SequenceI>();
         for (UniprotEntry entry : entries)
         {
-          StringBuilder name = constructSequenceFastaHeader(entry);
-
-          result.append(name).append(NEWLINE)
-                  .append(entry.getUniprotSequence().getContent())
-                  .append(NEWLINE);
+          seqs.add(uniprotEntryToSequenceI(entry));
         }
+        al = new Alignment(seqs.toArray(new SequenceI[0]));
 
-        // Then read in the features and apply them to the dataset
-        al = parseResult(result.toString());
-        if (al != null)
-        {
-          // Decorate the alignment with database entries.
-          addUniprotXrefs(al, entries);
-        }
-        else
-        {
-          results = result;
-        }
       }
       stopQuery();
       return al;
@@ -202,99 +188,95 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
   }
 
   /**
-   * Construct a Fasta-format sequence header by concatenating the source,
-   * accession id(s) and name(s), delimited by '|', plus any protein names, now
-   * with space rather than bar delimiter
    * 
    * @param entry
-   * @return
+   *          UniprotEntry
+   * @return SequenceI instance created from the UniprotEntry instance
    */
-  public static StringBuilder constructSequenceFastaHeader(
-          UniprotEntry entry)
-  {
-    StringBuilder name = new StringBuilder(32);
-    name.append(">UniProt/Swiss-Prot");
+  public SequenceI uniprotEntryToSequenceI(UniprotEntry entry){
+    String id = getUniprotEntryId(entry);
+    SequenceI sequence = new Sequence(id, entry.getUniprotSequence()
+            .getContent());
+    sequence.setDescription(getUniprotEntryDescription(entry));
+
+    final String dbVersion = getDbVersion();
+    ArrayList<DBRefEntry> dbRefs = new ArrayList<DBRefEntry>();
     for (String accessionId : entry.getAccession())
     {
-      name.append(BAR_DELIMITER);
-      name.append(accessionId);
+      DBRefEntry dbRef = new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
+              accessionId);
+      dbRefs.add(dbRef);
     }
-    for (String n : entry.getName())
+    sequence.setSourceDBRef((dbRefs != null && dbRefs.size() > 0) ? dbRefs
+            .get(0) : null);
+
+    Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
+    for (PDBEntry pdb : entry.getDbReference())
     {
-      name.append(BAR_DELIMITER);
-      name.append(n);
+      DBRefEntry dbr = new DBRefEntry();
+      dbr.setSource(pdb.getType());
+      dbr.setAccessionId(pdb.getId());
+      dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
+      dbRefs.add(dbr);
+      if ("PDB".equals(pdb.getType()))
+      {
+        onlyPdbEntries.addElement(pdb);
+      }
     }
 
-    if (entry.getProtein() != null && entry.getProtein().getName() != null)
+    sequence.setPDBId(onlyPdbEntries);
+    if (entry.getFeature() != null)
     {
-      for (String nm : entry.getProtein().getName())
+      for (SequenceFeature sf : entry.getFeature())
       {
-        name.append(" ").append(nm);
+        sf.setFeatureGroup("Uniprot");
+        sequence.addSequenceFeature(sf);
       }
     }
-    return name;
+    sequence.setDBRefs(dbRefs.toArray(new DBRefEntry[0]));
+    return sequence;
   }
 
   /**
-   * add an ordered set of UniprotEntry objects to an ordered set of seuqences.
    * 
-   * @param al
-   *          - a sequence of n sequences
-   * @param entries
-   *          a list of n uniprot entries to be analysed.
+   * @param entry
+   *          UniprotEntry
+   * @return protein name(s) delimited by a white space character
    */
-  public void addUniprotXrefs(AlignmentI al, Vector<UniprotEntry> entries)
+  public static String getUniprotEntryDescription(UniprotEntry entry)
   {
-    final String dbVersion = getDbVersion();
-
-    for (int i = 0; i < entries.size(); i++)
+    StringBuilder desc = new StringBuilder(32);
+    if (entry.getProtein() != null && entry.getProtein().getName() != null)
     {
-      UniprotEntry entry = entries.elementAt(i);
-      Vector<PDBEntry> onlyPdbEntries = new Vector<PDBEntry>();
-      Vector<DBRefEntry> dbxrefs = new Vector<DBRefEntry>();
-
-      for (PDBEntry pdb : entry.getDbReference())
-      {
-        DBRefEntry dbr = new DBRefEntry();
-        dbr.setSource(pdb.getType());
-        dbr.setAccessionId(pdb.getId());
-        dbr.setVersion(DBRefSource.UNIPROT + ":" + dbVersion);
-        dbxrefs.addElement(dbr);
-        if ("PDB".equals(pdb.getType()))
-        {
-          onlyPdbEntries.addElement(pdb);
-        }
-      }
-
-      SequenceI sq = al.getSequenceAt(i);
-      while (sq.getDatasetSequence() != null)
-      {
-        sq = sq.getDatasetSequence();
-      }
-
-      for (String accessionId : entry.getAccession())
+      for (String nm : entry.getProtein().getName())
       {
-        /*
-         * add as uniprot whether retrieved from uniprot or uniprot_name
-         */
-        sq.addDBRef(new DBRefEntry(DBRefSource.UNIPROT, dbVersion,
-                accessionId));
+        desc.append(nm).append(" ");
       }
+    }
+    return desc.toString();
+  }
 
-      for (DBRefEntry dbRef : dbxrefs)
-      {
-        sq.addDBRef(dbRef);
-      }
-      sq.setPDBId(onlyPdbEntries);
-      if (entry.getFeature() != null)
-      {
-        for (SequenceFeature sf : entry.getFeature())
-        {
-          sf.setFeatureGroup("Uniprot");
-          sq.addSequenceFeature(sf);
-        }
-      }
+  /**
+   *
+   * @param entry
+   *          UniprotEntry
+   * @return The accession id(s) and name(s) delimited by '|'.
+   */
+  public static String getUniprotEntryId(UniprotEntry entry)
+  {
+    StringBuilder name = new StringBuilder(32);
+    name.append("UniProt/Swiss-Prot");
+    for (String accessionId : entry.getAccession())
+    {
+      name.append(BAR_DELIMITER);
+      name.append(accessionId);
     }
+    for (String n : entry.getName())
+    {
+      name.append(BAR_DELIMITER);
+      name.append(n);
+    }
+    return name.toString();
   }
 
   /*
@@ -302,6 +284,7 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
    * 
    * @see jalview.ws.DbSourceProxy#isValidReference(java.lang.String)
    */
+  @Override
   public boolean isValidReference(String accession)
   {
     // TODO: make the following a standard validator
@@ -312,11 +295,13 @@ public class Uniprot extends DbSourceProxyImpl implements DbSourceProxy
   /**
    * return LDHA_CHICK uniprot entry
    */
+  @Override
   public String getTestQuery()
   {
     return "P00340";
   }
 
+  @Override
   public String getDbName()
   {
     return "Uniprot"; // getDbSource();
index a92b5c4..7e387bd 100644 (file)
@@ -135,9 +135,13 @@ public class UniprotTest
     Vector<UniprotEntry> entries = u.getUniprotEntries(reader);
     UniprotEntry entry = entries.get(0);
 
-    // source + accession ids + names + protein names
-    String expectedName = ">UniProt/Swiss-Prot|A9CKP4|A9CKP5|A9CKP4_AGRT5|A9CKP4_AGRT6 Mitogen-activated protein kinase 13 Henry";
-    assertEquals(expectedName, Uniprot.constructSequenceFastaHeader(entry)
-            .toString());
+    // source + accession ids + names
+    String expectedName = "UniProt/Swiss-Prot|A9CKP4|A9CKP5|A9CKP4_AGRT5|A9CKP4_AGRT6";
+    // protein names
+    String expectedDescription = "Mitogen-activated protein kinase 13 Henry ";
+
+    assertEquals(expectedName, Uniprot.getUniprotEntryId(entry));
+    assertEquals(expectedDescription,
+            Uniprot.getUniprotEntryDescription(entry));
   }
 }