2 * Jalview - A Sequence Alignment Editor and Viewer (Development Version 2.4.1)
3 * Copyright (C) 2009 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
21 import jalview.datamodel.Sequence;
22 import jalview.datamodel.SequenceI;
24 import java.util.Hashtable;
25 import java.util.Vector;
27 public class GroupUrlLink
30 * Helper class based on the UrlLink class which enables URLs to be
31 * constructed from sequences or IDs associated with a group of sequences. URL
32 * definitions consist of a pipe separated string containing a <label>|<url
33 * construct>|<separator character>[|<sequence separator character>]. The url
34 * construct includes regex qualified tokens which are replaced with sequence
35 * IDs ($SEQUENCEIDS$) and/or sequence regions ($SEQUENCES$) that are
36 * extracted from the group. See <code>UrlLink</code> for more information
37 * about the approach, and the original implementation.
40 private String url_prefix, target, label;
43 * these are all filled in order of the occurrence of each token in the url
// url_suffix, separators and regexReplace are sized from the number of tokens
// found in the template (the constructor allocates them with mtch.length /
// url_suffix.length), so index k in each refers to the k-th token.
46 private String url_suffix[], separators[], regexReplace[];
// stays null unless parsing recorded a problem; isValid() tests it against null
48 private String invalidMessage = null;
51 * tokens that can be replaced in the URL.
53 private static String[] tokens;
// NOTE(review): the declaration described by the next line is not visible in
// this listing; the constructor assigns an int[] named segs with one slot per
// entry of tokens.
56 * position of each token (which can appear once only) in the url
61 * contains tokens in the order they appear in the URL template.
63 private String[] mtch;
// the three token names; presumably the initial value of tokens - the
// enclosing statement is not visible in this listing (TODO confirm)
69 { "SEQUENCEIDS", "SEQUENCES", "DATASETID" };
73 // private int idseg = -1, seqseg = -1;
76 * parse the given linkString of the form '<label>|<url>|separator
77 * char[|optional sequence separator char]' into parts. url may contain a
78 * string $SEQUENCEIDS<=optional regex=>$ where <=optional regex=> must be of
79 * the form =/<perl style regex>/=$ or $SEQUENCES<=optional regex=>$ or
80 * $DATASETID<=optional regex=>$.
// Parses a link definition into label/target, url_prefix and, for each token
// found in the template, a url_suffix / regexReplace / separators entry.
// Problems detected while parsing are recorded in invalidMessage.
// NOTE(review): the embedded listing numbers skip in places (e.g. 87 -> 89),
// so some statements of this constructor are not visible here; the comments
// below describe only the visible lines.
84 public GroupUrlLink(String link)
86 int sep = link.indexOf("|");
87 segs = new int[tokens.length];
// segs[i] receives the first index of "$" + tokens[i] in link (-1 when absent)
89 for (int i = 0; i < segs.length; i++)
91 if ((segs[i] = link.indexOf("$" + tokens[i])) > -1)
96 // expect at least one token
99 invalidMessage = "Group URL string must contain at least one of ";
100 for (int i = 0; i < segs.length; i++)
102 invalidMessage += " '$" + tokens[i] + "[=/regex=/]$'";
// ptok / tmtch hold one slot per token found plus one extra trailing slot
107 int[] ptok = new int[ntoks + 1];
108 String[] tmtch = new String[ntoks + 1];
109 mtch = new String[ntoks];
110 for (int i = 0, t = 0; i < segs.length; i++)
115 tmtch[t++] = tokens[i];
// extra slot: the end of the link string, paired with a placeholder name
118 ptok[ntoks] = link.length();
119 tmtch[ntoks] = "$$$$$$$$$";
// presumably orders ptok ascending and permutes tmtch alongside it - confirm
// against QuickSort.sort(int[], Object[])
120 jalview.util.QuickSort.sort(ptok, tmtch);
// only the first ntoks names are copied, i.e. the placeholder is left out
121 for (int i = 0; i < ntoks; i++)
123 mtch[i] = tmtch[i]; // TODO: check order is ascending
126 * replaces the specific code below {}; if (psqids > -1 && pseqs > -1) { if
127 * (psqids > pseqs) { idseg = 1; seqseg = 0;
129 * ptok = new int[] { pseqs, psqids, link.length() }; mtch = new String[] {
130 * "$SEQUENCES", "$SEQUENCEIDS" }; } else { idseg = 0; seqseg = 1; ptok =
131 * new int[] { psqids, pseqs, link.length() }; mtch = new String[] {
132 * "$SEQUENCEIDS", "$SEQUENCES" }; } } else { if (psqids != -1) { idseg = 0;
133 * ptok = new int[] { psqids, link.length() }; mtch = new String[] {
134 * "$SEQUENCEIDS" }; } else { seqseg = 0; ptok = new int[] { pseqs,
135 * link.length() }; mtch = new String[] { "$SEQUENCES" }; } }
139 // first get the label and target part before the first |
// the loop keeps looking for the next '|' while it lies before ptok[0]
143 p = link.indexOf("|", sep + 1);
144 } while (p > sep && p < ptok[0]);
145 // Assuming that the URL itself does not contain any '|' symbols
146 // sep now contains last pipe symbol position prior to any regex symbols
147 label = link.substring(0, sep);
148 if (label.indexOf("|") > -1)
150 // | terminated database name / www target at start of Label
151 target = label.substring(0, label.indexOf("|"));
153 else if (label.indexOf(" ") > 2)
155 // space separated Label - matches database name
156 target = label.substring(0, label.indexOf(" "));
162 // Now Parse URL : Whole URL string first
// url_prefix is the text between the separator at sep and position ptok[0]
163 url_prefix = link.substring(sep + 1, ptok[0]);
164 url_suffix = new String[mtch.length];
165 regexReplace = new String[mtch.length];
166 // and loop through tokens
167 for (int pass = 0; pass < mtch.length; pass++)
// mlength = length of "$" + token + "=/", i.e. the offset from the token start
// to the first character of its regex
169 int mlength = 3 + mtch[pass].length();
170 if (link.indexOf("$" + mtch[pass] + "=/") == ptok[pass]
171 && (p = link.indexOf("/=$", ptok[pass] + mlength)) > ptok[pass]
174 // Extract Regex and suffix
// p + 3 is the first position after the closing "/=$"
175 if (ptok[pass + 1] < p + 3)
177 // tokens are not allowed inside other tokens - e.g. inserting a
178 // $sequences$ into the regex match for the sequenceid
179 invalidMessage = "Token regexes cannot contain other regexes (did you terminate the $"
180 + mtch[pass] + " regex with a '/=$' ?";
// suffix = text from the end of this token up to ptok[pass + 1];
// regex = text between "=/" and "/=$"
183 url_suffix[pass] = link.substring(p + 3, ptok[pass + 1]);
184 regexReplace[pass] = link.substring(ptok[pass] + mlength, p);
// the extracted regex is handed to Regex.perlCode wrapped in '/' delimiters
187 com.stevesoft.pat.Regex rg = com.stevesoft.pat.Regex.perlCode("/"
188 + regexReplace[pass] + "/");
191 invalidMessage = "Invalid Regular Expression : '"
192 + regexReplace[pass] + "'\n";
194 } catch (Exception e)
196 invalidMessage = "Invalid Regular Expression : '"
197 + regexReplace[pass] + "'\n";
// token without a regex
202 regexReplace[pass] = null;
203 // verify format is really correct.
204 if ((p = link.indexOf("$" + mtch[pass] + "$")) == ptok[pass])
// + 2 skips the two '$' characters surrounding the token name
206 url_suffix[pass] = link.substring(p + mtch[pass].length() + 2,
// NOTE(review): the message below always names mtch[0], not mtch[pass]
211 invalidMessage = "Warning: invalid regex structure (after '"
212 + mtch[0] + "') for URL link : " + link;
// separator parsing: everything after a '|' in the final suffix is taken as
// separator text; "," is the initial value of lastsep
217 separators = new String[url_suffix.length];
218 String suffices = url_suffix[url_suffix.length - 1], lastsep = ",";
219 // have a look in the last suffix for any more separators.
220 while ((p = suffices.indexOf('|')) > -1)
222 separators[pass] = suffices.substring(p + 1);
225 // trim the original suffix string
226 url_suffix[url_suffix.length - 1] = suffices.substring(0, p);
230 lastsep = (separators[pass - 1] = separators[pass - 1].substring(0,
233 suffices = separators[pass];
238 lastsep = separators[pass - 1];
240 // last separator is always used for all the remaining separators
241 while (pass < separators.length)
243 separators[pass++] = lastsep;
248 * @return the url_suffix
250 public String getUrl_suffix()
252 return url_suffix[url_suffix.length - 1];
256 * @return the url_prefix
258 public String getUrl_prefix()
266 public String getTarget()
274 public String getLabel()
280 * @return the sequence ID regexReplace
282 public String getIDRegexReplace()
284 return _replaceFor(tokens[0]);
287 private String _replaceFor(String token)
289 for (int i = 0; i < mtch.length; i++)
290 if (segs[i] > -1 && mtch[i].equals(token))
292 return regexReplace[i];
298 * @return the sequence ID regexReplace
300 public String getSeqRegexReplace()
302 return _replaceFor(tokens[1]);
306 * @return the invalidMessage
308 public String getInvalidMessage()
310 return invalidMessage;
314 * Check if URL string was parsed properly.
316 * @return boolean - if false then <code>getInvalidMessage</code> returns an
319 public boolean isValid()
321 return invalidMessage == null;
325 * return one or more URL strings by applying regex to the given idstring
328 * array of id strings to pass to service
330 * array of seq strings to pass to service
331 * @param onlyIfMatches
332 * - when true url strings are only made if regex is defined and
333 * matches for all qualified tokens in groupURL - TODO: consider if
334 * onlyIfMatches is really a useful parameter!
335 * @return null or Object[] { int[] { number of seqs substituted},boolean[] {
336 * which seqs were substituted }, StringBuffer[] { substituted lists
337 * for each token }, String[] { url } }
// Convenience overload: builds a token -> replacement-strings table and
// delegates to makeUrls(Hashtable, boolean).
// NOTE(review): some lines of this method are not visible in this listing
// (listing numbers 341, 346, 348-349, 351).
339 public Object[] makeUrls(String[] idstrings, String[] seqstrings,
340 String dsstring, boolean onlyIfMatches)
342 Hashtable rstrings = new Hashtable();
// tokens[0] -> id strings, tokens[1] -> sequence strings
343 rstrings.put(tokens[0], idstrings);
344 rstrings.put(tokens[1], seqstrings);
// tokens[2] -> a String[] whose contents are on a line not visible here
// (presumably just dsstring - TODO confirm)
345 rstrings.put(tokens[2], new String[]
// ids and sequences must pair up one-to-one
347 if (idstrings.length != seqstrings.length)
350 "idstrings and seqstrings contain one string each per sequence.");
352 return makeUrls(rstrings, onlyIfMatches);
// Builds one URL from the template: for each token in mtch the replacement
// strings are looked up in repstrings by token name, optionally filtered
// through the token's regex, joined and placed between url_prefix and the
// url_suffix entries. The value assembled at the end holds { seqsmatched },
// thismatched, matched and a String[] containing the URL.
// NOTE(review): many short lines of this method (declarations, else branches,
// loop headers, braces) are not visible in this listing; the comments below
// describe only the visible lines.
355 public Object[] makeUrls(Hashtable repstrings, boolean onlyIfMatches)
357 // prepare string arrays in correct order to be assembled into URL input
358 String[][] idseq = new String[mtch.length][]; // indexed by pass
359 int mins = 0, maxs = 0; // allowed two values, 1 or n-sequences.
360 for (int i = 0; i < mtch.length; i++)
// assumes repstrings has a String[] entry for every token named in mtch; a
// missing key would make the length access below fail
362 idseq[i] = (String[]) repstrings.get(mtch[i]);
363 if (idseq[i].length >= 1)
365 if (mins == 0 && idseq[i].length == 1)
371 maxs = idseq[i].length;
375 if (maxs != idseq[i].length)
378 "Cannot have mixed length replacement vectors. Replacement vector for "
379 + (mtch[i]) + " is " + idseq[i].length
380 + " strings long, and have already seen a "
381 + maxs + " length vector.");
388 "Cannot have zero length vector of replacement strings - either 1 value or n values.");
392 // iterate through input, collating segments to be inserted into url
393 StringBuffer matched[] = new StringBuffer[idseq.length];
394 // and precompile regexes
395 com.stevesoft.pat.Regex[] rgxs = new com.stevesoft.pat.Regex[matched.length];
396 for (pass = 0; pass < matched.length; pass++)
398 matched[pass] = new StringBuffer();
// tokens without a regex keep a null slot in rgxs
399 if (regexReplace[pass] != null)
401 rgxs[pass] = com.stevesoft.pat.Regex.perlCode("/" + regexReplace[pass]
409 // record which of the input sequences were actually used to generate the
411 boolean[] thismatched = new boolean[maxs];
// outer loop: one iteration per input sequence index
413 for (int sq = 0; sq < maxs; sq++)
415 // initialise flag for match
416 thismatched[sq] = false;
// thematches[pass] collects the text this sequence contributes to each token
417 String[] thematches = new String[rgxs.length];
418 for (pass = 0; pass < rgxs.length; pass++)
420 thematches[pass] = ""; // initialise - in case there are no more
422 // if a regex is provided, then it must match for all sequences in all
423 // tokens for it to be considered.
424 if (idseq[pass].length <= sq)
426 // no more replacement strings to try for this token
429 if (rgxs[pass] != null)
431 com.stevesoft.pat.Regex rg = rgxs[pass];
433 // concatenate all matches of re in the given string!
// each successful search resumes from the end of the previous match
434 while (rg.searchFrom(idseq[pass][sq], rematchat))
436 rematchat = rg.matchedTo();
437 thismatched[sq] |= true;
438 // do we take the cartesian products of the substituents ?
439 int ns = rg.numSubs();
442 thematches[pass] += rg.stringMatched();// take whole regex
445 * else if (ns==1) { // take only subgroup match return new String[]
446 * { rg.stringMatched(1), url_prefix+rg.stringMatched(1)+url_suffix
449 // deal with multiple submatch case - for moment we do the simplest
450 // - concatenate the matched regions, instead of creating a complete
451 // list for each alternate match over all sequences.
452 // TODO: specify a 'replace pattern' - next refinement
// diagnostic dump of every sub-match to stderr
456 for (int s = 0; s <= rg.numSubs(); s++)
458 System.err.println("Sub " + s + " : " + rg.matchedFrom(s)
459 + " : " + rg.matchedTo(s) + " : '"
460 + rg.stringMatched(s) + "'");
462 // try to collate subgroup matches
463 StringBuffer subs = new StringBuffer();
464 // have to loop through submatches, collating them at top level
469 if (s + 1 <= ns && rg.matchedTo(s) > -1
470 && rg.matchedTo(s + 1) > -1
471 && rg.matchedTo(s + 1) < rg.matchedTo(s))
473 // s is top level submatch. search for submatches enclosed by
477 while (r <= ns && rg.matchedTo(r) <= rg.matchedTo(s))
479 if (rg.matchedFrom(r) > -1)
481 rmtch += rg.stringMatched(r);
485 if (rmtch.length() > 0)
487 subs.append(rmtch); // simply concatenate
493 if (rg.matchedFrom(s) > -1)
495 subs.append(rg.stringMatched(s)); // concatenate
500 thematches[pass] += subs.toString();
506 // are we only supposed to take regex matches ?
509 thismatched[sq] |= true;
510 thematches[pass] = idseq[pass][sq]; // take whole string -
511 // regardless - probably not a
514 * TODO: do some boilerplate trimming of the fields to make them
515 * sensible e.g. trim off any 'prefix' in the id string (see UrlLink
516 * for the below) - pre 2.4 Jalview behaviour if
517 * (idstring.indexOf("|") > -1) { idstring =
518 * idstring.substring(idstring.lastIndexOf("|") + 1); }
525 // check if we are going to add this sequence's results ? all token
526 // replacements must be valid for this to happen!
527 // (including single value replacements - eg. dataset name)
530 for (pass = 0; pass < matched.length; pass++)
// a separator is inserted only for multi-valued tokens, and only between
// entries (never before the first one)
532 if (idseq[pass].length > 1 && matched[pass].length() > 0)
534 matched[pass].append(separators[pass]);
536 matched[pass].append(thematches[pass]);
541 // finally, if any sequences matched, then form the URL and return
// only the first token in template order (matched[0]) is tested for emptiness
542 if (matched[0].length() == 0)
544 // no matches - no url generated
547 StringBuffer submiturl = new StringBuffer();
548 submiturl.append(url_prefix);
// URL = url_prefix followed, per token, by its collected text and its suffix
549 for (pass = 0; pass < matched.length; pass++)
551 submiturl.append(matched[pass]);
552 if (url_suffix[pass] != null)
554 submiturl.append(url_suffix[pass]);
560 { seqsmatched }, thismatched, matched, new String[]
561 { submiturl.toString() } };
565 * get token types present in this url as a bitfield indicating presence of each token from tokens (LSB->MSB).
566 * @return groupURL class as integer
568 public int getGroupURLType()
571 for (int pass = 0; pass < tokens.length; pass++)
573 for (int i = 0; i < mtch.length; i++)
575 if (mtch[i].equals(tokens[pass]))
// Renders this link back into a single string starting with label, "|" and
// url_prefix, followed by each token with its regex (when present) and suffix,
// and finally the separators.
// NOTE(review): the lines between the visible append calls (listing numbers
// 588, 590-591, 594-595, 597-599, 601, 603-604) are not visible here, so any
// delimiter text emitted between the parts cannot be read from this listing.
584 public String toString()
586 StringBuffer result = new StringBuffer();
587 result.append(label + "|" + url_prefix);
// one round per token, in template order
589 for (r = 0; r < url_suffix.length; r++)
592 result.append(mtch[r]);
// the regex is written only for tokens that have one
593 if (regexReplace[r] != null)
596 result.append(regexReplace[r]);
600 result.append(url_suffix[r]);
602 for (r = 0; r < separators.length; r++)
605 result.append(separators[r]);
607 return result.toString();
611 * report stats about the generated url string given an input set
// Prints a report for one makeUrls result: url[0] is read as int[] (count of
// sequences used), url[1] as boolean[] (per-sequence match flags) and url[3]
// as String[] (the generated URL). idstring[0] supplies the sequence ids.
// NOTE(review): the rest of the parameter list and several short lines are
// not visible in this listing.
617 private static void testUrls(GroupUrlLink ul, String[][] idstring,
623 System.out.println("Created NO urls.");
// NOTE(review): "out of " has no leading space, so the count and the text are
// printed without a gap between them
627 System.out.println("Created a url from " + ((int[]) url[0])[0]
628 + "out of " + idstring[0].length + " sequences.");
629 System.out.println("Sequences that did not match:");
630 for (int sq = 0; sq < idstring[0].length; sq++)
632 if (!((boolean[]) url[1])[sq])
634 System.out.println("Seq " + sq + ": " + idstring[0][sq] + "\t: "
638 System.out.println("Sequences that DID match:");
639 for (int sq = 0; sq < idstring[0].length; sq++)
641 if (((boolean[]) url[1])[sq])
643 System.out.println("Seq " + sq + ": " + idstring[0][sq] + "\t: "
647 System.out.println("The generated URL:");
648 System.out.println(((String[]) url[3])[0]);
// Manual smoke test: parses each sample link definition, prints the parsed
// parts and, via testUrls, the URLs generated for one example sequence with
// onlyIfMatches both unset and set; invalid definitions are reported on stderr.
// NOTE(review): several short lines of this method are not visible in this
// listing.
652 public static void main(String argv[])
654 String[] links = new String[]
656 "EnVision2|IDS|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Enfin%20Default%20Workflow&datasetName=linkInDatasetFromJalview&input=$SEQUENCEIDS$&inputType=0|,",
657 "EnVision2|Seqs|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Enfin%20Default%20Workflow&datasetName=linkInDatasetFromJalview&input=$SEQUENCES$&inputType=1|,",
658 "EnVision2|IDS|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Enfin%20Default%20Workflow&datasetName=$DATASETID$&input=$SEQUENCEIDS$&inputType=0|,",
659 "EnVision2|Seqs|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Enfin%20Default%20Workflow&datasetName=$DATASETID$&input=$SEQUENCES$&inputType=1|,",
// NOTE(review): the next sample contains $SEQUENCEIDS$ twice, although the
// field documentation says each token can appear once only
660 "EnVision2|IDS|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=$SEQUENCEIDS$&datasetName=linkInDatasetFromJalview&input=$SEQUENCEIDS$&inputType=0|,",
661 "EnVision2|Seqs|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=$SEQUENCEIDS$&datasetName=$DATASETID$&input=$SEQUENCES$&inputType=1|,",
// the last two samples attach a regex to the SEQUENCES token
662 "EnVision2 Seqs|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Default&datasetName=JalviewSeqs$DATASETID$&input=$SEQUENCES=/([a-zA-Z]+)/=$&inputType=1|,",
663 "EnVision2 Seqs|http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?workflow=Default&datasetName=JalviewSeqs$DATASETID$&input=$SEQUENCES=/[A-Za-z]+/=$&inputType=1|,"
665 * http://www.ebi.ac.uk/enfin-srv/envision2/pages/linkin.jsf?input=P38389,P38398
666 * &inputType=0&workflow=Enfin%20Default%20Workflow&datasetName=
667 * linkInDatasetFromPRIDE
// a single example sequence; formStrings turns it into {ids, sequences}
671 SequenceI[] seqs = new SequenceI[]
672 { new Sequence("StupidLabel:gi|9234|pdb|102L|A",
673 "asdiasdpasdpadpwpadasdpaspdw"), };
674 String[][] seqsandids = formStrings(seqs);
675 for (int i = 0; i < links.length; i++)
677 GroupUrlLink ul = new GroupUrlLink(links[i]);
680 System.out.println("\n\n\n");
681 System.out.println("Link " + i + " " + links[i] + " : "
683 System.out.println(" pref : " + ul.getUrl_prefix());
684 System.out.println(" IdReplace : " + ul.getIDRegexReplace());
685 System.out.println(" SeqReplace : " + ul.getSeqRegexReplace());
686 System.out.println(" Suffixes : " + ul.getUrl_suffix());
689 .println("<insert input id and sequence strings here> Without onlyIfMatches:");
690 Object[] urls = ul.makeUrls(seqsandids[0], seqsandids[1],
692 testUrls(ul, seqsandids, urls);
694 .println("<insert input id and sequence strings here> With onlyIfMatches set:");
695 urls = ul.makeUrls(seqsandids[0], seqsandids[1], "mydataset", true);
696 testUrls(ul, seqsandids, urls);
700 System.err.println("Invalid URLLink : " + links[i] + " : "
701 + ul.getInvalidMessage());
707 * covenience method to generate the id and sequence string vector from a set
708 * of seuqences using each sequence's getName() and getSequenceAsString()
712 * @return String[][] {{sequence ids},{sequence strings}}
714 public static String[][] formStrings(SequenceI[] seqs)
716 String[][] idset = new String[2][seqs.length];
717 for (int i = 0; i < seqs.length; i++)
719 idset[0][i] = seqs[i].getName();
720 idset[1][i] = seqs[i].getSequenceAsString();
725 public void setLabel(String newlabel)
727 this.label = newlabel;