Merge branch 'bug/JAL-3529_uniprotidsinstockholm' into develop
authorJim Procter <jprocter@issues.jalview.org>
Tue, 14 Apr 2020 15:33:50 +0000 (16:33 +0100)
committerJim Procter <jprocter@issues.jalview.org>
Tue, 14 Apr 2020 15:33:50 +0000 (16:33 +0100)
src/jalview/io/StockholmFile.java
src/jalview/util/DBRefUtils.java
test/jalview/io/StockholmFileTest.java
test/jalview/util/DBRefUtilsTest.java

index 0e73af1..4697262 100644 (file)
@@ -28,12 +28,14 @@ import jalview.datamodel.AlignmentAnnotation;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.Annotation;
 import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
 import jalview.datamodel.Mapping;
 import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.Comparison;
+import jalview.util.DBRefUtils;
 import jalview.util.Format;
 import jalview.util.MessageManager;
 
@@ -332,17 +334,14 @@ public class StockholmFile extends AlignFile
 
           if (accAnnotations != null && accAnnotations.containsKey("AC"))
           {
-            if (dbsource != null)
+            String dbr = (String) accAnnotations.get("AC");
+            if (dbr != null)
             {
-              String dbr = (String) accAnnotations.get("AC");
-              if (dbr != null)
-              {
-                // we could get very clever here - but for now - just try to
-                // guess accession type from source of alignment plus structure
-                // of accession
-                guessDatabaseFor(seqO, dbr, dbsource);
-
-              }
+              // we could get very clever here - but for now - just try to
+              // guess the accession type from the type of sequence, the
+              // source of the alignment, and the structure of the
+              // accession
+              guessDatabaseFor(seqO, dbr, dbsource);
             }
             // else - do what ? add the data anyway and prompt the user to
             // specify what references these are ?
@@ -527,6 +526,9 @@ public class StockholmFile extends AlignFile
               treeName = an.stringMatched(2);
               treeString = new StringBuffer();
             }
+            // TODO: JAL-3532 - this is where GF comments and database references are
+            // lost. Suggest overriding this method for Stockholm files to catch and
+            // properly process CC, DR etc. into multivalued properties
             setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
           }
         }
@@ -755,6 +757,12 @@ public class StockholmFile extends AlignFile
         st = -1;
       }
     }
+    if (dbsource == null)
+    {
+      // make up an origin based on whether the sequence looks like it is nucleotide
+      // or protein
+      dbsource = (seqO.isProtein()) ? "PFAM" : "RFAM";
+    }
     if (dbsource.equals("PFAM"))
     {
       seqdb = "UNIPROT";
@@ -930,6 +938,11 @@ public class StockholmFile extends AlignFile
     return annot;
   }
 
+  private String dbref_to_ac_record(DBRefEntry ref)
+  {
+    return ref.getSource().toString() + " ; "
+            + ref.getAccessionId().toString();
+  }
   @Override
   public String print(SequenceI[] s, boolean jvSuffix)
   {
@@ -942,8 +955,10 @@ public class StockholmFile extends AlignFile
     int maxid = 0;
     int in = 0;
     Hashtable dataRef = null;
+    boolean isAA = s[in].isProtein();
     while ((in < s.length) && (s[in] != null))
     {
+
       String tmp = printId(s[in], jvSuffix);
       max = Math.max(max, s[in].getLength());
 
@@ -953,17 +968,33 @@ public class StockholmFile extends AlignFile
       }
       if (s[in].getDBRefs() != null)
       {
-        for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
+        if (dataRef == null)
+        {
+          dataRef = new Hashtable();
+        }
+        List<DBRefEntry> primrefs = s[in].getPrimaryDBRefs();
+        if (primrefs.size() >= 1)
         {
-          if (dataRef == null)
+          dataRef.put(tmp, dbref_to_ac_record(primrefs.get(0)));
+        }
+        else
+        {
+          for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
           {
-            dataRef = new Hashtable();
+            DBRefEntry dbref = s[in].getDBRefs()[idb];
+            dataRef.put(tmp, dbref_to_ac_record(dbref));
+            // if we put in a uniprot or EMBL record then we're done:
+            if (isAA && DBRefSource.UNIPROT
+                    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
+            {
+              break;
+            }
+            if (!isAA && DBRefSource.EMBL
+                    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
+            {
+              break;
+            }
           }
-
-          String datAs1 = s[in].getDBRefs()[idb].getSource().toString()
-                  + " ; "
-                  + s[in].getDBRefs()[idb].getAccessionId().toString();
-          dataRef.put(tmp, datAs1);
         }
       }
       in++;
@@ -996,7 +1027,8 @@ public class StockholmFile extends AlignFile
         String type = (String) dataRef.remove(idd);
         out.append(new Format("%-" + (maxid - 2) + "s")
                 .form("#=GS " + idd.toString() + " "));
-        if (type.contains("PFAM") || type.contains("RFAM"))
+        if (isAA && type.contains("UNIPROT")
+                || (!isAA && type.contains("EMBL")))
         {
 
           out.append(" AC " + type.substring(type.indexOf(";") + 1));
index 5afbca5..4d5a025 100755 (executable)
@@ -488,7 +488,7 @@ public class DBRefUtils
       else
       {
         // default:
-        ref = new DBRefEntry(locsrc, version, acn);
+        ref = new DBRefEntry(locsrc, version, acn.trim());
       }
     }
     if (ref != null)
index ba4312a..9fdd7b9 100644 (file)
  */
 package jalview.io;
 
+import static org.testng.Assert.assertTrue;
 import static org.testng.AssertJUnit.assertEquals;
 import static org.testng.AssertJUnit.assertNotNull;
 import static org.testng.AssertJUnit.assertTrue;
 import static org.testng.AssertJUnit.fail;
 
+import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentAnnotation;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.Annotation;
+import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.gui.JvOptionPane;
+import jalview.util.DBRefUtils;
 
 import java.io.File;
 import java.util.Arrays;
@@ -93,14 +98,55 @@ public class StockholmFileTest
   }
 
   /**
+   * JAL-3529 - verify that UniProt refs are output for sequences retrieved
+   * via Pfam
+   */
+  @Test(groups = { "Functional" })
+  public void dbrefOutput() throws Exception
+  {
+    // sequences retrieved in a Pfam domain alignment also have a PFAM database
+    // reference
+    SequenceI sq = new Sequence("FER2_SPIOL", "AASSDDDFFF");
+    sq.addDBRef(new DBRefEntry("UNIPROT", "1", "P00224"));
+    sq.addDBRef(new DBRefEntry("PFAM", "1", "P00224.1"));
+    sq.addDBRef(new DBRefEntry("PFAM", "1", "PF00111"));
+    AppletFormatAdapter af = new AppletFormatAdapter();
+    String toStockholm = af.formatSequences(FileFormat.Stockholm,
+            new Alignment(new SequenceI[]
+            { sq }), false);
+    System.out.println(toStockholm);
+    // java.util.regex needs MULTILINE so that ^ and $ match a single line
+    assertTrue(
+            Pattern.compile(
+                    "^#=GS\\s+FER2_SPIOL(/\\d+-\\d+)?\\s+AC\\s+P00224$",
+                    Pattern.MULTILINE).matcher(toStockholm)
+                    .find(),
+            "Couldn't locate UNIPROT Accession in generated Stockholm file.");
+    AlignmentI fromStockholm = af.readFile(toStockholm,
+            DataSourceType.PASTE, FileFormat.Stockholm);
+    SequenceI importedSeq = fromStockholm.getSequenceAt(0);
+    assertTrue(importedSeq.getDBRefs().length == 1,
+            "Expected just one database reference to be added to sequence.");
+    assertTrue(
+            importedSeq.getDBRefs()[0].getAccessionId().indexOf(" ") == -1,
+            "Spaces were found in accession ID.");
+    List<DBRefEntry> dbrefs = DBRefUtils.searchRefs(importedSeq.getDBRefs(),
+            "P00224");
+    assertTrue(dbrefs.size() == 1,
+            "Couldn't find Uniprot DBRef on re-imported sequence.");
+
+  }
+
+  /**
    * test alignment data in given file can be imported, exported and reimported
    * with no dataloss
    * 
    * @param f
-   *          - source datafile (IdentifyFile.identify() should work with it)
+   *                               - source datafile (IdentifyFile.identify()
+   *                               should work with it)
    * @param ioformat
-   *          - label for IO class used to write and read back in the data from
-   *          f
+   *                               - label for IO class used to write and read
+   *                               back in the data from f
    * @param ignoreFeatures
    * @param ignoreRowVisibility
    * @param allowNullAnnotations
index 0368d1e..ca5bca3 100644 (file)
@@ -135,6 +135,11 @@ public class DBRefUtilsTest
     assertEquals("1.2", ref.getVersion());
     assertEquals("a7890", ref.getAccessionId());
     assertTrue(seq.getAllPDBEntries().isEmpty());
+    SequenceI seq2 = new Sequence("Seq2", "ABCD");
+    // Check that whitespace doesn't confuse parseToDbRef
+    DBRefEntry ref2 = DBRefUtils.parseToDbRef(seq2, "EMBL", "1.2",
+            " a7890");
+    assertEquals(ref, ref2);
   }
 
   /**