src/jalview/analysis/SequenceIdMatcher.java

   1 /*\r
   2  * Jalview - A Sequence Alignment Editor and Viewer\r
   3  * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle\r
   4  *\r
   5  * This program is free software; you can redistribute it and/or\r
   6  * modify it under the terms of the GNU General Public License\r
   7  * as published by the Free Software Foundation; either version 2\r
   8  * of the License, or (at your option) any later version.\r
   9  *\r
  10  * This program is distributed in the hope that it will be useful,\r
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
  13  * GNU General Public License for more details.\r
  14  *\r
  15  * You should have received a copy of the GNU General Public License\r
  16  * along with this program; if not, write to the Free Software\r
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA\r
  18  */\r
  19 package jalview.analysis;\r
  20 \r
  21 import java.util.*;\r
  22 \r
  23 import jalview.datamodel.*;\r
  24 \r
  25 /**\r
  26  * <p>Title: </p>\r
  27  * SequenceIdMatcher\r
  28  * <p>Description: </p>\r
  29  * Routine which does approximate Sequence Id resolution by name using\r
  30  * string containment (on word boundaries) rather than equivalence\r
  31  * <p>Copyright: Copyright (c) 2004</p>\r
  32  *\r
  33  * <p>Company: Dundee University</p>\r
  34  *\r
  35  * @author not attributable\r
  36  * @version 1.0\r
  37  */\r
  38 public class SequenceIdMatcher\r
  39 {\r
  40   private Hashtable names;\r
  41 \r
  42   public SequenceIdMatcher(SequenceI[] seqs)\r
  43   {\r
  44     names = new Hashtable();\r
  45     for (int i = 0; i < seqs.length; i++)\r
  46     {\r
  47       names.put(new SeqIdName(seqs[i].getName()), seqs[i]);\r
  48     }\r
  49   }\r
  50 \r
  51   SequenceI findIdMatch(SequenceI seq)\r
  52   {\r
  53     SeqIdName nam = new SeqIdName(seq.getName());\r
  54 \r
  55     if (names.containsKey(nam))\r
  56     {\r
  57       return (SequenceI) names.get(nam);\r
  58     }\r
  59 \r
  60     return null;\r
  61   }\r
  62 \r
  63   SequenceI findIdMatch(String seqnam)\r
  64   {\r
  65     SeqIdName nam = new SeqIdName(seqnam);\r
  66 \r
  67     if (names.containsKey(nam))\r
  68     {\r
  69       return (SequenceI) names.get(nam);\r
  70     }\r
  71 \r
  72     return null;\r
  73   }\r
  74 \r
  75   /**\r
  76    * findIdMatch\r
  77    *\r
  78    * Return pointers to sequences (or sequence object containers)\r
  79    * which have same Id as a given set of different sequence objects\r
  80    *\r
  81    * @param seqs SequenceI[]\r
  82    * @return SequenceI[]\r
  83    */\r
  84   SequenceI[] findIdMatch(SequenceI[] seqs)\r
  85   {\r
  86     SequenceI[] namedseqs = null;\r
  87     int i = 0;\r
  88     SeqIdName nam;\r
  89 \r
  90     if (seqs.length > 0)\r
  91     {\r
  92       namedseqs = new SequenceI[seqs.length];\r
  93       do\r
  94       {\r
  95         nam = new SeqIdName(seqs[i].getName());\r
  96 \r
  97         if (names.containsKey(nam))\r
  98         {\r
  99           namedseqs[i] = (SequenceI) names.get(nam);\r
 100         }\r
 101         else\r
 102         {\r
 103           namedseqs[i] = null;\r
 104         }\r
 105       }\r
 106       while (++i < seqs.length);\r
 107     }\r
 108 \r
 109     return namedseqs;\r
 110   }\r
 111 \r
 112   private class SeqIdName\r
 113   {\r
 114     String id;\r
 115 \r
 116     SeqIdName(String s)\r
 117     {\r
 118       if (s!=null)\r
 119         id = new String(s);\r
 120       else\r
 121         id = "";\r
 122     }\r
 123 \r
 124     public int hashCode()\r
 125     {\r
 126       return ((id.length()>=4) ? id.substring(0, 4).hashCode() : id.hashCode());\r
 127     }\r
 128 \r
 129     public boolean equals(Object s)\r
 130     {\r
 131       if (s instanceof SeqIdName)\r
 132       {\r
 133         return this.equals( (SeqIdName) s);\r
 134       }\r
 135       else\r
 136       {\r
 137         if (s instanceof String)\r
 138         {\r
 139           return this.equals( (String) s);\r
 140         }\r
 141       }\r
 142 \r
 143       return false;\r
 144     }\r
 145 \r
 146     /**\r
 147      * Characters that define the end of a unique sequence ID at\r
 148      * the beginning of an arbitrary ID string\r
 149      * JBPNote: This is a heuristic that will fail for arbritrarily extended sequence id's\r
 150      * (like portions of an aligned set of repeats from one sequence)\r
 151      */\r
 152     private String WORD_SEP="~. |#\\/<>!\"£$%^*)}[@',?";\r
 153 \r
 154    /**\r
 155     * matches if one ID properly contains another at a whitespace boundary.\r
 156     * TODO: (JBPNote) These are not efficient. should use char[] for speed\r
 157     * todo: (JBPNote) Set separator characters appropriately\r
 158     * @param s SeqIdName\r
 159     * @return boolean\r
 160     */\r
 161     public boolean equals(SeqIdName s)\r
 162     {\r
 163       if (id.length()>s.id.length()) {\r
 164         return id.startsWith(s.id) ?\r
 165             (WORD_SEP.indexOf(id.charAt(s.id.length()))>-1)\r
 166             : false;\r
 167       } else\r
 168         return s.id.startsWith(id) ?\r
 169             (s.id.equals(id) ? true :\r
 170              (WORD_SEP.indexOf(s.id.charAt(id.length()))>-1))\r
 171             : false;\r
 172     }\r
 173 \r
 174     public boolean equals(String s)\r
 175     {\r
 176       if (id.length()>s.length()) {\r
 177         return id.startsWith(s) ?\r
 178             (WORD_SEP.indexOf(id.charAt(s.length()))>-1)\r
 179             : false;\r
 180       } else\r
 181         return s.startsWith(id) ?\r
 182             (s.equals(id) ? true :\r
 183              (WORD_SEP.indexOf(s.charAt(id.length()))>-1))\r
 184             : false;\r
 185     }\r
 186   }\r
 187 }\r