1 /*******************************************************************************
2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
3 * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
17 *******************************************************************************/
20 import java.util.ArrayList;
21 import java.util.StringTokenizer;
22 import java.util.regex.Pattern;
26 * utility class for dealing with HTML link extraction
// Utility that takes a description string, checks whether it looks like HTML,
// and builds (a) a content buffer and (b) a list of "label|link" entries for
// any anchors it finds.
// NOTE(review): every line of this listing carries a leading number, and the
// gaps in that numbering suggest that lines (likely braces and short
// statements) are absent here. The notes below describe only the statements
// that are actually visible.
30 public class ParseHtmlBodyAndLinks
// Accessor named for the original text; its body is not visible in this
// listing. Presumably returns the `orig` field that getNonHtmlContent() reads
// - confirm.
33 public String getOrig()
// Flag reported by isHtmlContent(); initialised to true.
37 boolean htmlContent=true;
39 * @return true if the content looked like HTML
42 public boolean isHtmlContent()
// Accumulators: `links` receives the "label|link" strings, `sb` receives the
// parsed content.
47 ArrayList<String> links=new ArrayList<String>();
48 StringBuffer sb = new StringBuffer();
50 * result of parsing description - with or without HTML tags
// Body not visible; presumably the content of `sb` - confirm.
53 public String getContent()
59 * list of Label|Link encoded URL links extracted from HTML
62 public ArrayList<String> getLinks() {
68 * @param description - html or text content to be parsed
69 * @param removeHTML flag to indicate if HTML tags should be removed if they are present.
// NOTE(review): no visible statement uses `newline`; it is presumably what the
// "br" branch below appends - confirm.
72 public ParseHtmlBodyAndLinks(String description,
73 boolean removeHTML, String newline)
// Guard for a null or empty description (the guarded statements are not
// visible in this listing).
75 if (description==null || description.length()==0)
// Case-insensitive search for the literal "<HTML>" marker; -1 means the
// marker is absent.
80 if (description.toUpperCase().indexOf("<HTML>") == -1)
// The tokenizer is created with "<" as its delimiter, but the first
// nextToken(String) call below replaces that delimiter set.
85 StringTokenizer st = new StringTokenizer(description, "<");
89 while (st.hasMoreElements())
// nextToken(String) switches the delimiters to '&' and '>', so a token may
// still contain a '<' followed by a tag name.
91 token = st.nextToken("&>");
// Tokens equal to "html" (ignoring case) or starting with "/" get their own
// handling (that handling is not visible here).
92 if (token.equalsIgnoreCase("html") || token.startsWith("/"))
// Split the token at '<': the text after it becomes `tag`, the text before it
// stays in `token`.
98 startTag = token.indexOf("<");
102 tag = token.substring(startTag + 1);
103 token = token.substring(0, startTag);
// Anchor tags are recognised only when the tag text starts with A HREF=
// (compared in upper case).
106 if (tag != null && tag.toUpperCase().startsWith("A HREF="))
108 if (token.length() > 0)
// The link is everything after the first double quote, minus the tag's final
// character (assumed to be the closing quote - confirm).
112 link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
// Delimiters switch to '<' and '>' so the next token is the anchor's label.
113 String label = st.nextToken("<>");
114 links.add(label + "|" + link);
// The label is written to the content followed by the literal marker %LINK%.
115 sb.append(label + "%LINK%");
117 else if (tag != null && tag.equalsIgnoreCase("br"))
// Because '&' is a delimiter, an entity such as &lt; arrives as a token that
// starts with "lt;". The next three branches turn lt / gt / amp back into the
// characters they stand for and keep the rest of the token.
121 else if (token.startsWith("lt;"))
123 sb.append("<" + token.substring(3));
125 else if (token.startsWith("gt;"))
127 sb.append(">" + token.substring(3));
129 else if (token.startsWith("amp;"))
131 sb.append("&" + token.substring(4));
// Taken only when removal was requested but htmlContent is false.
138 if (removeHTML && !htmlContent)
140 // instead of parsing the html into plaintext
141 // clean the description ready for embedding in html
// NOTE(review): as written this replaces "<" with "<", which leaves the text
// unchanged. Given the stated goal of making the description safe to embed in
// HTML, an escaped replacement (such as the &lt; entity) was presumably
// intended - confirm against the upstream source.
142 sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("<"));
148 * get either the parsed content or the original, depending on whether the original looked like html content or not.
151 public String getNonHtmlContent()
// Parsed buffer when the input looked like HTML, otherwise the `orig` field.
153 return isHtmlContent() ? sb.toString() : orig;