2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.util.ArrayList;
24 import java.util.StringTokenizer;
25 import java.util.regex.Pattern;
28 * utility class for dealing with HTML link extraction
// Parses a description string that may contain HTML, collecting the text into
// a buffer (sb) and any anchor links into a list (links).
// NOTE(review): the line numbers embedded in this listing jump (e.g. 37 -> 42,
// 106 -> 112), so braces and some statements are not shown here. The comments
// below describe only the visible lines; confirm against the complete file.
33 public class ParseHtmlBodyAndLinks
// Accessor for 'orig'; its body is not shown. The only visible use of 'orig'
// is in getNonHtmlContent(), where it is returned for non-HTML content --
// presumably it holds the unmodified description (TODO confirm).
37 public String getOrig()
// Flag exposed through isHtmlContent(); starts out true.
42 boolean htmlContent = true;
45 * @return true if the content looked like HTML
47 public boolean isHtmlContent()
// Receives one "label|link" entry per anchor tag found by the constructor.
52 ArrayList<String> links = new ArrayList<String>();
// Accumulates the processed text; read back via sb.toString() in
// getNonHtmlContent().
54 StringBuffer sb = new StringBuffer();
57 * result of parsing description - with or without HTML tags
61 public String getContent()
68 * list of Label|Link encoded URL links extracted from HTML
72 public ArrayList<String> getLinks()
80 * - html or text content to be parsed
82 * flag to indicate if HTML tags should be removed if they are
// Constructor. The parameter list continues on a line that is not shown.
86 public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
// Guard for a null or empty description (the handling itself is not shown).
89 if (description == null || description.length() == 0)
// Case-insensitive check for a literal "<HTML>" marker; this branch is taken
// when the marker is absent. Presumably htmlContent is cleared here (TODO
// confirm -- the branch body is not shown).
94 if (description.toUpperCase().indexOf("<HTML>") == -1)
// The tokenizer is created with "<" as delimiter, but every nextToken call
// below supplies its own delimiter set, which then becomes the current one.
99 StringTokenizer st = new StringTokenizer(description, "<");
103 while (st.hasMoreElements())
// Tokens end at '&' or '>', so a token can be plain text, text followed by
// "<tagname", or the tail of a character entity such as "lt;...".
105 token = st.nextToken("&>");
// Matches a bare "html" token or one beginning with "/" (the action taken for
// these tokens is on a line not shown).
106 if (token.equalsIgnoreCase("html") || token.startsWith("/"))
// Split the token at '<': text after it becomes 'tag', text before it stays in
// 'token'. The condition guarding the split is not shown.
112 startTag = token.indexOf("<");
116 tag = token.substring(startTag + 1);
117 token = token.substring(0, startTag);
// Anchor tag: matched by a case-insensitive "A HREF=" prefix.
120 if (tag != null && tag.toUpperCase().startsWith("A HREF="))
122 if (token.length() > 0)
// The link is everything after the first double quote up to, but excluding,
// the final character of the tag (expected to be the closing quote).
126 link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
// The label is the text up to the next '<' or '>'.
127 String label = st.nextToken("<>");
// Record the link as "label|link" and leave "label%LINK%" in the text buffer.
128 links.add(label + "|" + link);
129 sb.append(label + "%LINK%");
// Line-break tag (handling not shown).
131 else if (tag != null && tag.equalsIgnoreCase("br"))
// Because '&' is a delimiter, the entities &lt; &gt; and &amp; arrive as
// tokens starting with "lt;", "gt;" and "amp;". Each is replaced by its
// literal character, followed by the rest of the token.
135 else if (token.startsWith("lt;"))
137 sb.append("<" + token.substring(3));
139 else if (token.startsWith("gt;"))
141 sb.append(">" + token.substring(3));
143 else if (token.startsWith("amp;"))
145 sb.append("&" + token.substring(4));
// When removeHTML is set and the content was not recognised as HTML, sb is
// replaced by the description passed through a regex replacement of "<".
// NOTE(review): as shown, "<" is replaced with "<", which is a no-op. The
// replacement was presumably the entity "&lt;" before this listing was
// extracted -- confirm against the upstream file.
152 if (removeHTML && !htmlContent)
154 // instead of parsing the html into plaintext
155 // clean the description ready for embedding in html
156 sb = new StringBuffer(Pattern.compile("<").matcher(description)
157 .replaceAll("<"));
164 * get either the parsed content or the original, depending on whether the
165 * original looked like html content or not.
169 public String getNonHtmlContent()
// Parsed buffer for HTML content, otherwise the stored 'orig' value.
171 return isHtmlContent() ? sb.toString() : orig;