2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.util.Locale;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.StringTokenizer;
28 import java.util.regex.Pattern;
31 * Utility class for extracting links and text content from HTML
36 public class ParseHtmlBodyAndLinks
38 private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern
43 public String getOrig()
48 boolean htmlContent = true;
51 * @return true if the content looked like HTML
53 public boolean isHtmlContent()
58 List<String> links = new ArrayList<String>();
63 * result of parsing description - with or without HTML tags
67 public String getContent()
74 * List of links extracted from the HTML, each encoded as "label|url"
78 public List<String> getLinks()
84 * Parses the given html and
86 * <li>extracts any 'href' links to a list of "displayName|url" strings,
87 * retrievable by #getLinks</li>
88 * <li>extracts the remaining text (with %LINK% placeholders replacing hrefs),
89 * retrievable by #getContent</li>
93 * - html or text content to be parsed
95 * flag to indicate if HTML tags should be removed if they are
99 public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
102 if (description == null || description.length() == 0)
107 StringBuilder sb = new StringBuilder(description.length());
108 if (description.toUpperCase(Locale.ROOT).indexOf("<HTML>") == -1)
113 StringTokenizer st = new StringTokenizer(description, "<");
117 while (st.hasMoreElements())
119 token = st.nextToken(">");
120 if (token.equalsIgnoreCase("html") || token.startsWith("/"))
126 startTag = token.indexOf("<");
130 tag = token.substring(startTag + 1);
131 token = token.substring(0, startTag);
134 if (tag != null && tag.toUpperCase(Locale.ROOT).startsWith("A HREF="))
136 if (token.length() > 0)
140 link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
141 String label = st.nextToken("<>");
142 links.add(label + "|" + link);
143 sb.append(label + "%LINK%");
145 else if (tag != null && tag.equalsIgnoreCase("br"))
154 if (removeHTML && !htmlContent)
156 // instead of parsing the html into plaintext
157 // clean the description ready for embedding in html
158 sb = new StringBuilder(LEFT_ANGLE_BRACKET_PATTERN.matcher(description)
159 .replaceAll("<"));
161 content = translateEntities(sb.toString());
164 private String translateEntities(String s)
166 s = s.replaceAll("&", "&");
167 s = s.replaceAll("<", "<");
168 s = s.replaceAll(">", ">");
173 * Returns the parsed content if the original looked like HTML content,
174 * otherwise returns the original.
178 public String getNonHtmlContent()
180 return isHtmlContent() ? content : orig;