after merge
[jalview.git] / src / jalview / io / SequenceFeatureFetcher.java
index bb404d8..d980b0a 100755 (executable)
@@ -22,396 +22,318 @@ import jalview.datamodel.*;
 \r
 import jalview.gui.*;\r
 \r
-import jalview.io.*;\r
-\r
 import java.io.*;\r
 \r
 import java.util.*;\r
 \r
+import org.exolab.castor.mapping.Mapping;\r
 \r
-public class SequenceFeatureFetcher implements Runnable {\r
-    AlignmentI align;\r
-    AlignmentPanel ap;\r
-    ArrayList unknownSequences;\r
-    CutAndPasteTransfer output = new CutAndPasteTransfer();\r
-    StringBuffer sbuffer = new StringBuffer();\r
-\r
-    public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap) {\r
-        unknownSequences = new ArrayList();\r
-        this.align = align;\r
-        this.ap = ap;\r
-\r
-        Thread thread = new Thread(this);\r
-        thread.start();\r
-    }\r
-\r
-    public void run() {\r
-        String cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE");\r
-\r
-        RandomAccessFile out = null;\r
-\r
-        try {\r
-            if (cache == null) {\r
-                jalview.bin.Cache.setProperty("UNIPROT_CACHE",\r
-                    System.getProperty("user.home") + "/.jalview.uniprot.xml");\r
-                cache = jalview.bin.Cache.getProperty("UNIPROT_CACHE");\r
-            }\r
-\r
-            File test = new File(cache);\r
-\r
-            if (!test.exists()) {\r
-                out = new RandomAccessFile(cache, "rw");\r
-                out.writeBytes("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");\r
-                out.writeBytes("<UNIPROT_CACHE>\n");\r
-            } else {\r
-                out = new RandomAccessFile(cache, "rw");\r
-\r
-                // open exisiting cache and remove </UNIPROT_CACHE> from the end\r
-                long lastLine = 0;\r
-                String data;\r
-\r
-                while ((data = out.readLine()) != null) {\r
-                    if (data.indexOf("</entry>") > -1) {\r
-                        lastLine = out.getFilePointer();\r
-                    }\r
-                }\r
+import org.exolab.castor.xml.*;\r
+import jalview.analysis.AlignSeq;\r
 \r
-                out.seek(lastLine);\r
-            }\r
-\r
-            int seqIndex = 0;\r
-            Vector sequences = align.getSequences();\r
 \r
-            while (seqIndex < sequences.size()) {\r
-                ArrayList ids = new ArrayList();\r
 \r
-                for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
-                        seqIndex++, i++) {\r
-                    SequenceI sequence = (SequenceI) sequences.get(seqIndex);\r
-                    ids.add(sequence.getName());\r
-                }\r
+/**\r
+ * Fetches UniProt entries from EBI and applies their features, PDB entries and start/end to matching sequences.\r
+ *\r
+ * @author $author$\r
+ * @version $Revision$\r
+ */\r
+public class SequenceFeatureFetcher implements Runnable\r
+{\r
 \r
-                tryLocalCacheFirst(ids, align);\r
+  AlignmentI align;\r
+  AlignmentI dataset;\r
+  AlignmentPanel ap;\r
+  ArrayList unknownSequences;\r
+  CutAndPasteTransfer output = new CutAndPasteTransfer();\r
+  StringBuffer sbuffer = new StringBuffer();\r
+  boolean uniprotFlag = false;\r
 \r
-                if (ids.size() > 0) {\r
-                    StringBuffer remainingIds = new StringBuffer("uniprot:");\r
+  public SequenceFeatureFetcher()\r
+  {}\r
 \r
-                    for (int i = 0; i < ids.size(); i++)\r
-                        remainingIds.append(ids.get(i) + ";");\r
+  public Vector getUniprotEntries(File file)\r
+  {\r
 \r
-                    EBIFetchClient ebi = new EBIFetchClient();\r
-                    String[] result = ebi.fetchData(remainingIds.toString(),\r
-                            "xml", null);\r
+    UniprotFile uni = new UniprotFile();\r
+    try\r
+    {\r
+      // 1. Load the Castor mapping from the classpath resource /uniprot_mapping.xml\r
+      Mapping map = new Mapping(uni.getClass().getClassLoader());\r
+      java.net.URL url = getClass().getResource("/uniprot_mapping.xml");\r
+      map.loadMapping(url);\r
 \r
-                    if (result != null) {\r
-                        ReadUniprotFile(result, out, align);\r
-                    }\r
-                }\r
-            }\r
+      // 2. Unmarshal the data\r
+      Unmarshaller unmar = new Unmarshaller();\r
+      unmar.setIgnoreExtraElements(true);\r
+      unmar.setMapping(map);\r
+      uni = (UniprotFile) unmar.unmarshal(new FileReader(file));\r
 \r
-            if (out != null) {\r
-                out.writeBytes("</UNIPROT_CACHE>\n");\r
-                out.close();\r
+    }\r
+    catch (Exception e)\r
+    {\r
+      System.out.println("Error getUniprotEntries() "+e);\r
+    }\r
+    return uni.getUniprotEntries();\r
+  }\r
+\r
+  /**\r
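+   * Creates a fetcher for the given alignment and immediately starts it on a new thread.\r
+   *\r
+   * @param align alignment to annotate; its dataset (align.getDataset()) supplies the sequences that are looked up\r
+   * @param ap alignment panel that is repainted after fetching and passed on to the blast client\r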
+   */\r
+  public SequenceFeatureFetcher(AlignmentI align, AlignmentPanel ap)\r
+  {\r
+    unknownSequences = new ArrayList();\r
+    this.align = align;\r
+    this.dataset = align.getDataset();\r
+    this.ap = ap;\r
+\r
+    Thread thread = new Thread(this);\r
+    thread.start();\r
+  }\r
+\r
+  /**\r
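+   * Queries EBI, in batches of up to 50 ids, for dataset sequences without features; then reports updates and offers a blast.\r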
+   */\r
+  public void run()\r
+  {\r
+    try\r
+    {\r
+      int seqIndex = 0;\r
+      Vector sequences = dataset.getSequences();\r
+\r
+      while (seqIndex < sequences.size())\r
+      {\r
+        Vector ids = new Vector();\r
+\r
+        for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
+             seqIndex++, i++)\r
+        {\r
+          Sequence sequence = (Sequence) sequences.get(seqIndex);\r
+          if(sequence.getSequenceFeatures()==null)\r
+          {\r
+            if (!ids.contains(sequence.getName()))\r
+            {\r
+              ids.add(sequence.getName());\r
+              unknownSequences.add(sequence);\r
             }\r
-        } catch (Exception ex) {\r
-            ex.printStackTrace();\r
-        }\r
-\r
-        jalview.gui.PaintRefresher.Refresh(null, align);\r
-        findMissingIds(align);\r
-\r
-        if (sbuffer.length() > 0) {\r
-            output.setText(\r
-                "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
-                "altered, most likely the start/end residue will have been updated.\n" +\r
-                "Save your alignment to maintain the updated id.\n\n" +\r
-                sbuffer.toString());\r
-            Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
+          }\r
         }\r
 \r
-        if (unknownSequences.size() > 0) {\r
-            //ignore for now!!!!!!!!!!\r
-            //  WSWUBlastClient blastClient = new WSWUBlastClient(align, unknownSequences);\r
+        ///////////////////////////////////\r
+        ///READ FROM EBI\r
+        if (ids.size() > 0)\r
+        {\r
+          StringBuffer remainingIds = new StringBuffer("uniprot:");\r
+          for (int i = 0; i < ids.size(); i++)\r
+           {\r
+             if(ids.get(i).toString().indexOf("|")>-1)\r
+             {\r
+               remainingIds.append(ids.get(i).toString().substring(\r
+                   ids.get(i).toString().lastIndexOf("|") + 1));\r
+               uniprotFlag = true;\r
+             }\r
+             remainingIds.append(ids.get(i) + ";");\r
+           }\r
+          EBIFetchClient ebi = new EBIFetchClient();\r
+          File file = ebi.fetchDataAsFile(remainingIds.toString(),\r
+                                          "xml", "raw");\r
+\r
+\r
+\r
+          if (file != null)\r
+          {\r
+            ReadUniprotFile(file, ids);\r
+          }\r
         }\r
+      }\r
     }\r
-\r
-    void ReadUniprotFile(String[] result, RandomAccessFile out, AlignmentI align) {\r
-        SequenceI sequence = null;\r
-        Vector features = null;\r
-        String type;\r
-        String description;\r
-        String status;\r
-        String start;\r
-        String end;\r
-        String pdb = null;\r
-\r
-        for (int r = 0; r < result.length; r++) {\r
-            if ((sequence == null) && (result[r].indexOf("<name>") > -1)) {\r
-                long filePointer = 0;\r
-\r
-                if (out != null) {\r
-                    try {\r
-                        filePointer = out.getFilePointer();\r
-                        out.writeBytes("<entry>\n");\r
-                    } catch (Exception ex) {\r
-                    }\r
-                }\r
-\r
-                String seqName = parseElement(result[r], "<name>", out);\r
-                sequence = align.findName(seqName);\r
-\r
-                if (sequence == null) {\r
-                    sequence = align.findName(seqName.substring(0,\r
-                                seqName.indexOf('_')));\r
-\r
-                    if (sequence != null) {\r
-                        sbuffer.append("changing " + sequence.getName() +\r
-                            " to " + seqName + "\n");\r
-                        sequence.setName(seqName);\r
-                    }\r
-                }\r
-\r
-                if (sequence == null) {\r
-                    sbuffer.append("UNIPROT updated suggestion is " +\r
-                        result[r] + "\n");\r
-                    sequence = align.findName(result[r]);\r
-\r
-                    // this entry has been suggested by ebi.\r
-                    // doesn't match id in alignment file\r
-                    try {\r
-                        out.setLength(filePointer);\r
-                    } catch (Exception ex) {\r
-                    }\r
-\r
-                    // now skip to next entry\r
-                    while (result[r].indexOf("</entry>") == -1)\r
-                        r++;\r
-                }\r
-\r
-                features = new Vector();\r
-                type = "";\r
-                start = "0";\r
-                end = "0";\r
-                description = "";\r
-                status = "";\r
-                pdb = "";\r
-            }\r
-\r
-            if (sequence == null) {\r
-                continue;\r
-            }\r
-\r
-            if (result[r].indexOf("<property type=\"pdb accession\"") > -1) {\r
-                pdb = parseValue(result[r], "value=", out);\r
-                sequence.setPDBId(pdb);\r
-            }\r
-\r
-            if (result[r].indexOf("feature type") > -1) {\r
-                type = parseValue(result[r], "type=", out);\r
-                description = parseValue(result[r], "description=", null);\r
-                status = parseValue(result[r], "status=", null);\r
-\r
-                while (result[r].indexOf("position") == -1) {\r
-                    r++; //<location>\r
-                }\r
-\r
-                // r++;\r
-                if (result[r].indexOf("begin") > -1) {\r
-                    start = parseValue(result[r], "position=", out);\r
-                    end = parseValue(result[++r], "position=", out);\r
-                } else {\r
-                    start = parseValue(result[r], "position=", out);\r
-                    end = parseValue(result[r], "position=", null);\r
-                }\r
-\r
-                int sstart = Integer.parseInt(start);\r
-                int eend = Integer.parseInt(end);\r
-\r
-                if (out != null) {\r
-                    try {\r
-                        out.writeBytes("</feature>\n");\r
-                    } catch (Exception ex) {\r
-                    }\r
-                }\r
-\r
-                SequenceFeature sf = new SequenceFeature(type, sstart, eend,\r
-                        description, status);\r
-                features.add(sf);\r
-            }\r
-\r
-            if (result[r].indexOf("<sequence") > -1) {\r
-                StringBuffer seqString = new StringBuffer();\r
-\r
-                if (out != null) {\r
-                    try {\r
-                        out.writeBytes(result[r] + "\n");\r
-                    } catch (Exception ex) {\r
-                    }\r
-                }\r
-\r
-                while (result[++r].indexOf("</sequence>") == -1) {\r
-                    seqString.append(result[r]);\r
-\r
-                    if (out != null) {\r
-                        try {\r
-                            out.writeBytes(result[r] + "\n");\r
-                        } catch (Exception ex) {\r
-                        }\r
-                    }\r
-                }\r
-\r
-                if (out != null) {\r
-                    try {\r
-                        out.writeBytes(result[r] + "\n");\r
-                    } catch (Exception ex) {\r
-                    }\r
-                }\r
-\r
-                StringBuffer nonGapped = new StringBuffer();\r
-\r
-                for (int i = 0; i < sequence.getSequence().length(); i++) {\r
-                    if (!jalview.util.Comparison.isGap(sequence.getCharAt(i))) {\r
-                        nonGapped.append(sequence.getCharAt(i));\r
-                    }\r
-                }\r
-\r
-                int absStart = seqString.toString().indexOf(nonGapped.toString());\r
-\r
-                if (absStart == -1) {\r
-                    unknownSequences.add(sequence.getName());\r
-                    features = null;\r
-                    sbuffer.append(sequence.getName() +\r
-                        " SEQUENCE NOT %100 MATCH \n");\r
-\r
-                    continue;\r
-                }\r
-\r
-                int absEnd = absStart + nonGapped.toString().length();\r
-                absStart += 1;\r
-\r
-                if ((absStart != sequence.getStart()) ||\r
-                        (absEnd != sequence.getEnd())) {\r
-                    sbuffer.append("Updated: " + sequence.getName() + " " +\r
-                        sequence.getStart() + "/" + sequence.getEnd() +\r
-                        "  to  " + absStart + "/" + absEnd + "\n");\r
-                }\r
-\r
-                sequence.setStart(absStart);\r
-                sequence.setEnd(absEnd);\r
-            }\r
-\r
-            if (result[r].indexOf("</entry>") > -1) {\r
-                if (features != null) {\r
-                    sequence.setSequenceFeatures(features);\r
-                }\r
-\r
-                features = null;\r
-                sequence = null;\r
-\r
-                if (out != null) {\r
-                    try {\r
-                        out.writeBytes("</entry>\n");\r
-                    } catch (Exception ex) {\r
-                    }\r
-                }\r
-            }\r
-        }\r
+    catch (Exception ex)\r
+    {\r
+      ex.printStackTrace();\r
     }\r
 \r
-    void findMissingIds(AlignmentI align) {\r
-        String data;\r
-        ArrayList cachedIds = new ArrayList();\r
-\r
-        try {\r
-            BufferedReader in = new BufferedReader(new FileReader(\r
-                        jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
-\r
-            while ((data = in.readLine()) != null) {\r
-                if (data.indexOf("name") > -1) {\r
-                    String name = parseElement(data, "<name>", null);\r
-                    cachedIds.add(name);\r
-                }\r
-            }\r
-        } catch (Exception ex) {\r
-            ex.printStackTrace();\r
-        }\r
+    if (sbuffer.length() > 0)\r
+    {\r
+      output.setText(\r
+          "Your sequences have been matched to Uniprot. Some of the ids have been\n" +\r
+          "altered, most likely the start/end residue will have been updated.\n" +\r
+          "Save your alignment to maintain the updated id.\n\n" +\r
+          sbuffer.toString());\r
+      Desktop.addInternalFrame(output, "Sequence names updated ", 600, 300);\r
+      // The above is the dataset, we must now find out the index\r
+      // of the viewed sequence\r
 \r
-        for (int i = 0; i < align.getHeight(); i++)\r
-            if (!cachedIds.contains(align.getSequenceAt(i).getName())) {\r
-                unknownSequences.add(align.getSequenceAt(i).getName());\r
-            }\r
     }\r
 \r
-    void tryLocalCacheFirst(ArrayList ids, AlignmentI align) {\r
-        ArrayList cacheData = new ArrayList();\r
-\r
-        try {\r
-            BufferedReader in = new BufferedReader(new FileReader(\r
-                        jalview.bin.Cache.getProperty("UNIPROT_CACHE")));\r
-\r
-            // read through cache file, if the cache has sequences we're looking for\r
-            // add the lines to a new String array, Readthis new array and\r
-            // make sure we remove the ids from the list to retrieve from EBI\r
-            String data;\r
-\r
-            while ((data = in.readLine()) != null) {\r
-                if (data.indexOf("name") > -1) {\r
-                    String name = parseElement(data, "<name>", null);\r
-\r
-                    if (ids.contains(name)) {\r
-                        cacheData.add("<entry>");\r
-                        cacheData.add(data);\r
-\r
-                        while (data.indexOf("</entry>") == -1) {\r
-                            data = in.readLine();\r
-                            cacheData.add(data);\r
-                        }\r
-\r
-                        cacheData.add(data);\r
-\r
-                        ids.remove(name);\r
-                    }\r
-                }\r
-            }\r
-        } catch (Exception ex) {\r
-            ex.printStackTrace();\r
-        }\r
-\r
-        String[] localData = new String[cacheData.size()];\r
-        cacheData.toArray(localData);\r
-\r
-        if ((localData != null) && (localData.length > 0)) {\r
-            ReadUniprotFile(localData, null, align);\r
+    promptBeforeBlast();\r
+\r
+  }\r
+\r
+\r
+  void promptBeforeBlast()\r
+   {\r
+     // This must be outside the run() body as Java 1.5\r
+     // will not return any value from the JOptionPane to the expired thread.\r
+      if (unknownSequences.size() > 0)\r
+      {\r
+        int reply = javax.swing.JOptionPane.showConfirmDialog(\r
+            Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences."\r
+                +"\nPerform blast for unknown sequences?",\r
+                    "Blast for Unidentified Sequences",\r
+                     javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE);\r
+\r
+        if(reply == javax.swing.JOptionPane.YES_OPTION)\r
+         new WSWUBlastClient(ap, align, unknownSequences);\r
+      }\r
+\r
+\r
+    ap.repaint();\r
+  }\r
+\r
+  /**\r
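+   * Applies the UniProt entries unmarshalled from a fetched file to the matching dataset sequences.\r
+   *\r
+   * @param file XML file returned by the EBI fetch; nothing is done if it does not exist\r
+   * @param ids sequence names that were requested from EBI; names that are matched\r
+   *            to an entry are removed from this list\r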
+   */\r
+  void ReadUniprotFile(File file, Vector ids)\r
+  {\r
+    if(!file.exists())\r
+      return;\r
+\r
+    SequenceI sequence = null;\r
+\r
+    Vector entries = getUniprotEntries(file);\r
+\r
+    int i, iSize = entries==null?0:entries.size();\r
+    UniprotEntry entry;\r
+    for (i = 0; i < iSize; i++)\r
+    {\r
+      entry = (UniprotEntry) entries.elementAt(i);\r
+      String idmatch = entry.getAccession().elementAt(0).toString();\r
+      sequence = dataset.findName(idmatch);\r
+\r
+      if (sequence == null)\r
+      {\r
+        // The sequence may be labelled with the entry name rather than the accession\r
+        idmatch = entry.getName().elementAt(0).toString();\r
+        sequence = dataset.findName(idmatch);\r
+      }\r
+\r
+      if(sequence!=null)\r
+        ids.remove(sequence.getName());\r
+\r
+      else  if (sequence == null && uniprotFlag)\r
+      {\r
+          sequence = dataset.findName("UniProt/Swiss-Prot|"+entry.getAccession().elementAt(0)+"|"+idmatch);\r
+          ids.remove(idmatch);\r
+      }\r
+\r
+      if(sequence ==null)\r
+      {\r
+        System.out.println(idmatch+" not found");\r
+        continue;\r
+      }\r
+\r
+      unknownSequences.remove(sequence);\r
+\r
+      String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence());\r
+\r
+      int absStart = entry.getUniprotSequence().getContent().indexOf(\r
+          nonGapped.toString());\r
+\r
+      if (absStart == -1)\r
+      {\r
+        // Is UniprotSequence contained in dataset sequence?\r
+        absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent());\r
+        if(absStart == -1)\r
+        {\r
+          unknownSequences.add(sequence.getName());\r
+          sbuffer.append(sequence.getName() +\r
+                         " SEQUENCE NOT %100 MATCH \n");\r
+\r
+          continue;\r
         }\r
-    }\r
-\r
-    String parseValue(String line, String tag, RandomAccessFile out) {\r
-        if (out != null) {\r
-            try {\r
-                out.writeBytes(line + "\n");\r
-            } catch (Exception ex) {\r
+        else\r
+        {\r
+          if(entry.getFeature()!=null)\r
+          {\r
+            Enumeration e = entry.getFeature().elements();\r
+            while (e.hasMoreElements())\r
+            {\r
+              SequenceFeature sf = (SequenceFeature) e.nextElement();\r
+              sf.setBegin(sf.getBegin() + absStart + 1);\r
+              sf.setEnd(sf.getEnd() + absStart + 1);\r
             }\r
-        }\r
+          }\r
 \r
-        int index = line.indexOf(tag) + tag.length() + 1;\r
-\r
-        if (index == tag.length()) {\r
-            return "";\r
+          sbuffer.append(sequence.getName() +\r
+                         " HAS "+absStart+" PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES"\r
+                        +" HAVE BEEN ADJUSTED ACCORDINGLY \n");\r
+          absStart = 0;\r
         }\r
 \r
-        return line.substring(index, line.indexOf("\"", index + 1));\r
-    }\r
-\r
-    String parseElement(String line, String tag, RandomAccessFile out) {\r
-        if (out != null) {\r
-            try {\r
-                out.writeBytes(line + "\n");\r
-            } catch (Exception ex) {\r
-            }\r
+      }\r
+\r
+      int absEnd = absStart + nonGapped.toString().length();\r
+      absStart += 1;\r
+\r
+      Enumeration e = entry.getDbReference().elements();\r
+      Vector onlyPdbEntries = new Vector();\r
+      while(e.hasMoreElements())\r
+      {\r
+        PDBEntry pdb = (PDBEntry)e.nextElement();\r
+        if(!pdb.getType().equals("PDB"))\r
+          continue;\r
+\r
+        onlyPdbEntries.addElement(pdb);\r
+      }\r
+\r
+      sequence.setPDBId(onlyPdbEntries);\r
+      sequence.setSequenceFeatures(entry.getFeature());\r
+      sequence.setStart(absStart);\r
+      sequence.setEnd(absEnd);\r
+\r
+\r
+      int n = 0;\r
+      SequenceI seq2;\r
+      while (n < align.getHeight())\r
+      {\r
+        //This loop enables multiple sequences with the same\r
+        //id to have features added and seq limits updated\r
+        seq2 = align.getSequenceAt(n);\r
+        if (seq2.getName().equals(idmatch))\r
+        {\r
+\r
+          nonGapped = AlignSeq.extractGaps("-. ", seq2.getSequence());\r
+\r
+          absStart = sequence.getSequence().indexOf(nonGapped);\r
+          absEnd = absStart + nonGapped.toString().length() - 1;\r
+\r
+          // This is a sequence of the viewed alignment.\r
+          // No need to tell the user about the dataset updates\r
+          if ( (seq2.getStart() != absStart+sequence.getStart())\r
+             || (seq2.getEnd() != absEnd+sequence.getStart()))\r
+          {\r
+            sbuffer.append("Updated: " + seq2.getName() + " " +\r
+                           seq2.getStart() + "/" + seq2.getEnd() +\r
+                           "  to  " + (absStart + sequence.getStart()) + "/" +\r
+                           (absEnd + sequence.getStart()) + "\n");\r
+\r
+            seq2.setStart(absStart + sequence.getStart());\r
+            seq2.setEnd(absEnd + sequence.getStart());\r
+          }\r
         }\r
 \r
-        int index = line.indexOf(tag) + tag.length();\r
-\r
-        return line.substring(index, line.indexOf("</"));\r
+        n++;\r
+      }\r
     }\r
+  }\r
 }\r
+\r
+\r