Querystring made from line tokenized seq name, seq refs held with what was sent to...
author amwaterhouse <Andrew Waterhouse>
Mon, 13 Nov 2006 14:20:02 +0000 (14:20 +0000)
committer amwaterhouse <Andrew Waterhouse>
Mon, 13 Nov 2006 14:20:02 +0000 (14:20 +0000)
src/jalview/io/DBRefFetcher.java

index d57f3be..d26ef6d 100644 (file)
@@ -41,15 +41,16 @@ import jalview.analysis.AlignSeq;
  */\r
 public class DBRefFetcher implements Runnable\r
 {\r
-  AlignmentI align;\r
   AlignmentI dataset;\r
   AlignFrame af;\r
-  ArrayList unknownSequences;\r
   CutAndPasteTransfer output = new CutAndPasteTransfer();\r
   StringBuffer sbuffer = new StringBuffer();\r
-  boolean uniprotFlag = false;\r
   boolean running = false;\r
 \r
+  ///This will be a collection of Vectors of sequenceI refs.\r
+  //The key will be the seq name or accession id of the seq\r
+  Hashtable seqRefs;\r
+\r
   public DBRefFetcher()\r
   {}\r
 \r
@@ -67,7 +68,6 @@ public class DBRefFetcher implements Runnable
       Unmarshaller unmar = new Unmarshaller(uni);\r
       unmar.setIgnoreExtraElements(true);\r
       unmar.setMapping(map);\r
-   //   unmar.setDebug(true);\r
 \r
       uni = (UniprotFile) unmar.unmarshal(new FileReader(file));\r
     }\r
@@ -89,8 +89,6 @@ public class DBRefFetcher implements Runnable
   public DBRefFetcher(AlignmentI align, AlignFrame af)\r
   {\r
     this.af = af;\r
-    unknownSequences = new ArrayList();\r
-    this.align = align;\r
     this.dataset = align.getDataset();\r
   }\r
 \r
@@ -114,6 +112,42 @@ public class DBRefFetcher implements Runnable
   }\r
 \r
   /**\r
+   * The sequence will be added to a vector of sequences\r
+   * belonging to key which could be either seq name or dbref id\r
+   * @param seq SequenceI\r
+   * @param key String\r
+   */\r
+  void addSeqId(SequenceI seq, String key)\r
+  {\r
+    key = key.toUpperCase();\r
+\r
+    Vector seqs;\r
+    if(seqRefs.containsKey(key))\r
+    {\r
+      seqs = (Vector)seqRefs.get(key);\r
+\r
+      if(seqs!=null && !seqs.contains(seq))\r
+      {\r
+        seqs.addElement(seq);\r
+      }\r
+      else if(seqs==null)\r
+      {\r
+        seqs = new Vector();\r
+        seqs.addElement(seq);\r
+      }\r
+\r
+    }\r
+    else\r
+    {\r
+      seqs = new Vector();\r
+      seqs.addElement(seq);\r
+    }\r
+\r
+    seqRefs.put(key, seqs);\r
+  }\r
+\r
+\r
+  /**\r
    * DOCUMENT ME!\r
    */\r
   public void run()\r
@@ -122,6 +156,8 @@ public class DBRefFetcher implements Runnable
     af.setProgressBar("Fetching db refs", startTime);\r
     running = true;\r
 \r
+    seqRefs = new Hashtable();\r
+\r
     try\r
     {\r
       int seqIndex = 0;\r
@@ -129,7 +165,7 @@ public class DBRefFetcher implements Runnable
 \r
       while (seqIndex < sequences.size())\r
       {\r
-        Vector ids = new Vector();\r
+        StringBuffer queryString = new StringBuffer("uniprot:");\r
 \r
         for (int i = 0; (seqIndex < sequences.size()) && (i < 50);\r
              seqIndex++, i++)\r
@@ -139,48 +175,42 @@ public class DBRefFetcher implements Runnable
               jalview.datamodel.DBRefSource.UNIPROT});\r
           if (uprefs!=null)\r
           {\r
-            // we know the id for this entry, so don't note its ID in the unknownSequences list\r
-            for (int j=0,k=uprefs.length; j<k; j++)\r
-              ids.add(uprefs[j].getAccessionId());\r
-            unknownSequences.add(sequence);\r
-          } else {\r
-            if (!ids.contains(sequence.getName()))\r
+            if(uprefs.length+i>50)\r
+              break;\r
+\r
+            for(int j=0; j<uprefs.length; j++) // one query term per known accession\r
+            {\r
+              addSeqId(sequence, uprefs[j].getAccessionId());\r
+              queryString.append(uprefs[j].getAccessionId()+";"); // j, not the batch counter i\r
+            }\r
+          }\r
+          else\r
+          {\r
+            StringTokenizer st = new StringTokenizer(sequence.getName(), "|");\r
+            if(st.countTokens()+i>50)\r
+            {\r
+              //Dont send more than 50 id strings to dbFetch!!\r
+              seqIndex --;\r
+            }\r
+            else\r
             {\r
-              ids.add(sequence.getName());\r
-              unknownSequences.add(sequence);\r
+                while(st.hasMoreTokens())\r
+                {\r
+                  String token = st.nextToken();\r
+                  addSeqId(sequence, token);\r
+                  queryString.append(token+";");\r
+                }\r
             }\r
           }\r
         }\r
 \r
         ///////////////////////////////////\r
         ///READ FROM EBI\r
-        if (ids.size() > 0)\r
+        EBIFetchClient ebi = new EBIFetchClient();\r
+        File file = ebi.fetchDataAsFile(queryString.toString(), "xml", "raw");\r
+        if (file != null)\r
         {\r
-          StringBuffer remainingIds = new StringBuffer("uniprot:");\r
-          for (int i = 0; i < ids.size(); i++)\r
-           {\r
-             if(ids.get(i).toString().indexOf("|")>-1)\r
-             {\r
-               remainingIds.append(ids.get(i).toString().substring(\r
-                   ids.get(i).toString().lastIndexOf("|") + 1));\r
-               uniprotFlag = true;\r
-             }\r
-             else\r
-               remainingIds.append(ids.get(i));\r
-\r
-             remainingIds.append(";");\r
-           }\r
-\r
-          EBIFetchClient ebi = new EBIFetchClient();\r
-          File file = ebi.fetchDataAsFile(remainingIds.toString(),\r
-                                          "xml", "raw");\r
-\r
-\r
-\r
-          if (file != null)\r
-          {\r
-            ReadUniprotFile(file, ids);\r
-          }\r
+          ReadUniprotFile(file);\r
         }\r
       }\r
     }\r
@@ -210,28 +240,6 @@ public class DBRefFetcher implements Runnable
   }\r
 \r
 \r
-  void promptBeforeBlast()\r
-   {\r
-     // This must be outside the run() body as java 1.5\r
-     // will not return any value from the OptionPane to the expired thread.\r
-      if (unknownSequences.size() > 0)\r
-      {\r
-       // int reply = javax.swing.JOptionPane.showConfirmDialog(\r
-       //     Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences."\r
-        //        +"\nPerform blast for unknown sequences?",\r
-        //            "Blast for Unidentified Sequences",\r
-        //             javax.swing.JOptionPane.YES_NO_OPTION, javax.swing.JOptionPane.QUESTION_MESSAGE);\r
-     javax.swing.JOptionPane.showMessageDialog(\r
-    Desktop.desktop, "Couldn't find a match for "+unknownSequences.size()+" sequences.",\r
-            "Unidentified Sequences",\r
-             javax.swing.JOptionPane.WARNING_MESSAGE);\r
-\r
-\r
-      //  if(reply == javax.swing.JOptionPane.YES_OPTION)\r
-     //    new WSWUBlastClient(ap, align, unknownSequences);\r
-      }\r
-  }\r
-\r
   /**\r
    * DOCUMENT ME!\r
    *\r
@@ -239,120 +247,124 @@ public class DBRefFetcher implements Runnable
    * @param out DOCUMENT ME!\r
    * @param align DOCUMENT ME!\r
    */\r
-  void ReadUniprotFile(File file, Vector ids)\r
+  void ReadUniprotFile(File file)\r
   {\r
-    if(!file.exists())\r
+    if (!file.exists())\r
       return;\r
 \r
-    SequenceI [] sequence = null;\r
+    SequenceI sequence = null;\r
 \r
     Vector entries = getUniprotEntries(file);\r
 \r
-    int i, iSize = entries==null?0:entries.size();\r
+    int i, iSize = entries == null ? 0 : entries.size();\r
     UniprotEntry entry;\r
     for (i = 0; i < iSize; i++)\r
     {\r
       entry = (UniprotEntry) entries.elementAt(i);\r
-      String idmatch = entry.getAccession().elementAt(0).toString();\r
-      sequence = dataset.findSequenceMatch(idmatch);\r
 \r
-      if (sequence.length==0)\r
+      //Work out which sequences this Uniprot file has matches to,\r
+      //taking into account all accessionIds and names in the file\r
+      Vector sequenceMatches = new Vector();\r
+      for (int j = 0; j < entry.getAccession().size(); j++)\r
       {\r
-        //Sequence maybe Name, not Accession\r
-        idmatch = entry.getName().elementAt(0).toString();\r
-        sequence = dataset.findSequenceMatch(idmatch);\r
-      }\r
-\r
-      if(sequence.length>0)\r
-        ids.remove(sequence[0].getName());\r
-\r
-      else  if (sequence.length==0 && uniprotFlag)\r
-      {\r
-          StringBuffer upid = new StringBuffer("UniProt/Swiss-Prot|");\r
-          for(int u=0; u<entry.getAccession().size(); u++)\r
-            upid.append(entry.getAccession().elementAt(u)+"|");\r
-\r
-          sequence = dataset.findSequenceMatch(upid+idmatch);\r
-          ids.remove(idmatch);\r
+        String accessionId = entry.getAccession().elementAt(j).toString();\r
+        if (seqRefs.containsKey(accessionId))\r
+        {\r
+          Vector seqs = (Vector) seqRefs.get(accessionId);\r
+          for (int jj = 0; jj < seqs.size(); jj++)\r
+          {\r
+            sequence = (SequenceI) seqs.elementAt(jj);\r
+            if (!sequenceMatches.contains(sequence))\r
+              sequenceMatches.addElement(sequence);\r
+          }\r
+        }\r
       }\r
-\r
-      if(sequence.length==0)\r
+      for (int j = 0; j < entry.getName().size(); j++)\r
       {\r
-        System.out.println(idmatch+" not found");\r
-        continue;\r
+        String name = entry.getName().elementAt(j).toString();\r
+        if (seqRefs.containsKey(name))\r
+        {\r
+          Vector seqs = (Vector) seqRefs.get(name);\r
+          for (int jj = 0; jj < seqs.size(); jj++)\r
+          {\r
+            sequence = (SequenceI) seqs.elementAt(jj);\r
+            if (!sequenceMatches.contains(sequence))\r
+              sequenceMatches.addElement(sequence);\r
+          }\r
+        }\r
       }\r
 \r
-      for(int m=0; m<sequence.length; m++)\r
+      for (int m = 0; m < sequenceMatches.size(); m++)\r
       {\r
+        sequence = (SequenceI) sequenceMatches.elementAt(m);\r
+        sequence.addDBRef(new DBRefEntry(DBRefSource.UNIPROT,\r
+                                         "0",\r
+                                         entry.getAccession().elementAt(0).\r
+                                         toString()));\r
 \r
-      sequence[m].addDBRef(new DBRefEntry(DBRefSource.UNIPROT,\r
-                                       "0",\r
-                                       entry.getAccession().elementAt(0).toString()));\r
+        System.out.println("Adding dbref to " + sequence.getName() + " : " +\r
+                           entry.getAccession().elementAt(0).toString());\r
 \r
-      System.out.println("Adding dbref to "+sequence[m].getName()+" : "+\r
-                         entry.getAccession().elementAt(0).toString());\r
+        String nonGapped = AlignSeq.extractGaps("-. ", sequence.getSequence()).\r
+            toUpperCase();\r
 \r
-      String nonGapped = AlignSeq.extractGaps("-. ", sequence[m].getSequence()).toUpperCase();\r
+        int absStart = entry.getUniprotSequence().getContent().indexOf(\r
+            nonGapped.toString());\r
 \r
-      int absStart = entry.getUniprotSequence().getContent().indexOf(\r
-          nonGapped.toString());\r
-\r
-      if (absStart == -1)\r
-      {\r
-        // Is UniprotSequence contained in dataset sequence?\r
-        absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().getContent());\r
-        if(absStart == -1)\r
+        if (absStart == -1)\r
         {\r
-          sbuffer.append(sequence[m].getName() +\r
-                         " SEQUENCE NOT %100 MATCH \n");\r
-\r
-          continue;\r
-        }\r
-\r
-        if (entry.getFeature() != null)\r
-        {\r
-          Enumeration e = entry.getFeature().elements();\r
-          while (e.hasMoreElements())\r
+          // Is UniprotSequence contained in dataset sequence?\r
+          absStart = nonGapped.toString().indexOf(entry.getUniprotSequence().\r
+                                                  getContent());\r
+          if (absStart == -1)\r
           {\r
-            SequenceFeature sf = (SequenceFeature) e.nextElement();\r
-            sf.setBegin(sf.getBegin() + absStart + 1);\r
-            sf.setEnd(sf.getEnd() + absStart + 1);\r
+            sbuffer.append(sequence.getName() + " SEQUENCE NOT %100 MATCH \n");\r
+            continue;\r
           }\r
 \r
-          sbuffer.append(sequence[m].getName() +\r
-                         " HAS " + absStart +\r
-              " PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES"\r
-                         + " HAVE BEEN ADJUSTED ACCORDINGLY \n");\r
-          absStart = 0;\r
-        }\r
+          if (entry.getFeature() != null)\r
+          {\r
+            Enumeration e = entry.getFeature().elements();\r
+            while (e.hasMoreElements())\r
+            {\r
+              SequenceFeature sf = (SequenceFeature) e.nextElement();\r
+              sf.setBegin(sf.getBegin() + absStart + 1);\r
+              sf.setEnd(sf.getEnd() + absStart + 1);\r
+            }\r
 \r
-      }\r
+            sbuffer.append(sequence.getName() +\r
+                           " HAS " + absStart +\r
+                           " PREFIXED RESIDUES COMPARED TO UNIPROT - ANY SEQUENCE FEATURES"\r
+                           + " HAVE BEEN ADJUSTED ACCORDINGLY \n");\r
+            absStart = 0;\r
+          }\r
 \r
-      unknownSequences.remove(sequence);\r
+        }\r
 \r
-      int absEnd = absStart + nonGapped.toString().length();\r
-      absStart += 1;\r
+        //unknownSequences.remove(sequence);\r
 \r
-      Enumeration e = entry.getDbReference().elements();\r
-      Vector onlyPdbEntries = new Vector();\r
-      while(e.hasMoreElements())\r
-      {\r
-        PDBEntry pdb = (PDBEntry)e.nextElement();\r
-        if(!pdb.getType().equals(DBRefSource.PDB))\r
-          continue;\r
+        int absEnd = absStart + nonGapped.toString().length();\r
+        absStart += 1;\r
 \r
-        sequence[m].addDBRef(new DBRefEntry(DBRefSource.PDB,\r
-                                       "0",\r
-                                       pdb.getId()));\r
+        Enumeration e = entry.getDbReference().elements();\r
+        Vector onlyPdbEntries = new Vector();\r
+        while (e.hasMoreElements())\r
+        {\r
+          PDBEntry pdb = (PDBEntry) e.nextElement();\r
+          if (!pdb.getType().equals(DBRefSource.PDB))\r
+            continue;\r
 \r
-        onlyPdbEntries.addElement(pdb);\r
-      }\r
+          sequence.addDBRef(new DBRefEntry(DBRefSource.PDB,\r
+                                           "0",\r
+                                           pdb.getId()));\r
 \r
-      sequence[m].setPDBId(onlyPdbEntries);\r
+          onlyPdbEntries.addElement(pdb);\r
+        }\r
 \r
-      sequence[m].setStart(absStart);\r
-      sequence[m].setEnd(absEnd);\r
+        sequence.setPDBId(onlyPdbEntries);\r
 \r
+        sequence.setStart(absStart);\r
+        sequence.setEnd(absEnd);\r
 \r
       }\r
     }\r