{\r
return;\r
}\r
+ jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(sf.getDescription(), removeHTML, newline);\r
\r
- if (removeHTML\r
- && sf.getDescription().toUpperCase().indexOf("<HTML>") == -1)\r
+ sf.description = (removeHTML) ? parsed.getNonHtmlContent() : sf.description;\r
+ for (String link:parsed.getLinks())\r
{\r
- removeHTML = false;\r
- }\r
-\r
- StringBuffer sb = new StringBuffer();\r
- StringTokenizer st = new StringTokenizer(sf.getDescription(), "<");\r
- String token, link;\r
- int startTag;\r
- String tag = null;\r
- while (st.hasMoreElements())\r
- {\r
- token = st.nextToken("&>");\r
- if (token.equalsIgnoreCase("html") || token.startsWith("/"))\r
- {\r
- continue;\r
- }\r
-\r
- tag = null;\r
- startTag = token.indexOf("<");\r
-\r
- if (startTag > -1)\r
- {\r
- tag = token.substring(startTag + 1);\r
- token = token.substring(0, startTag);\r
- }\r
-\r
- if (tag != null && tag.toUpperCase().startsWith("A HREF="))\r
- {\r
- if (token.length() > 0)\r
- {\r
- sb.append(token);\r
- }\r
- link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);\r
- String label = st.nextToken("<>");\r
- sf.addLink(label + "|" + link);\r
- sb.append(label + "%LINK%");\r
- }\r
- else if (tag != null && tag.equalsIgnoreCase("br"))\r
- {\r
- sb.append(newline);\r
- }\r
- else if (token.startsWith("lt;"))\r
- {\r
- sb.append("<" + token.substring(3));\r
- }\r
- else if (token.startsWith("gt;"))\r
- {\r
- sb.append(">" + token.substring(3));\r
- }\r
- else if (token.startsWith("amp;"))\r
- {\r
- sb.append("&" + token.substring(4));\r
- }\r
- else\r
- {\r
- sb.append(token);\r
- }\r
- }\r
-\r
- if (removeHTML)\r
- {\r
- sf.description = sb.toString();\r
+ sf.addLink(link);\r
}\r
\r
}\r
--- /dev/null
+package jalview.util;
+
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+import java.util.regex.Pattern;
+
+
+/**
+ * Utility class for extracting hyperlinks from a (possibly HTML) description
+ * string and producing a plain-text rendering of it.
+ * <p>
+ * All parsing happens in the constructor. Afterwards the getters expose the
+ * original text, the parsed text, and the links that were found, each encoded
+ * as {@code Label|URL}.
+ *
+ * @author jprocter
+ */
+public class ParseHtmlBodyAndLinks
+{
+  /** Matches a literal left angle bracket; compiled once and reused. */
+  private static final Pattern LEFT_ANGLE_BRACKET = Pattern.compile("<");
+
+  /** Marker appended to the parsed content directly after each link label. */
+  private static final String LINK_MARKER = "%LINK%";
+
+  /** The description exactly as passed to the constructor (may be null). */
+  String orig = null;
+
+  /** True when the description contained an {@code <html>} tag (any case). */
+  boolean htmlContent = true;
+
+  /** Links found while parsing, each encoded as {@code Label|URL}. */
+  ArrayList<String> links = new ArrayList<String>();
+
+  /** Accumulates the parsed (or, for non-HTML input, escaped) content. */
+  StringBuffer sb = new StringBuffer();
+
+  /**
+   * @return the description that was passed to the constructor
+   */
+  public String getOrig()
+  {
+    return orig;
+  }
+
+  /**
+   * @return true if the content looked like HTML, i.e. it contained an
+   *         {@code <html>} tag
+   */
+  public boolean isHtmlContent()
+  {
+    return htmlContent;
+  }
+
+  /**
+   * Result of parsing the description. For HTML input this is the text with
+   * tags stripped, {@code <br>} replaced by the newline string and the basic
+   * entities decoded. For non-HTML input parsed with {@code removeHTML} set,
+   * it is the original text with every {@code <} escaped as {@code &lt;} so
+   * that it can be embedded in HTML.
+   *
+   * @return the parsed content (never null)
+   */
+  public String getContent()
+  {
+    return sb.toString();
+  }
+
+  /**
+   * @return list of {@code Label|URL} encoded links extracted from the
+   *         description (empty if none were found)
+   */
+  public ArrayList<String> getLinks()
+  {
+    return links;
+  }
+
+  /**
+   * Parses the given description, collecting links and building the parsed
+   * content.
+   *
+   * @param description
+   *          html or text content to be parsed; null or empty yields empty
+   *          content and no links
+   * @param removeHTML
+   *          when true and the description does not look like HTML, the
+   *          content returned by {@link #getContent()} is the description
+   *          with {@code <} escaped rather than the tag-stripped text
+   * @param newline
+   *          string substituted for each {@code <br>} tag
+   */
+  public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
+          String newline)
+  {
+    // Record the input first so that getOrig()/getNonHtmlContent() return the
+    // caller's string (e.g. "") rather than null on the early return below.
+    orig = description;
+    if (description == null || description.length() == 0)
+    {
+      htmlContent = false;
+      return;
+    }
+    htmlContent = description.toUpperCase().indexOf("<HTML>") != -1;
+
+    parse(description, newline);
+
+    if (removeHTML && !htmlContent)
+    {
+      // instead of parsing the html into plaintext
+      // clean the description ready for embedding in html
+      sb = new StringBuffer(
+              LEFT_ANGLE_BRACKET.matcher(description).replaceAll("&lt;"));
+    }
+  }
+
+  /**
+   * Get either the parsed content or the original, depending on whether the
+   * original looked like html content or not.
+   *
+   * @return the tag-stripped text for HTML input, otherwise the original
+   *         description unchanged
+   */
+  public String getNonHtmlContent()
+  {
+    return isHtmlContent() ? sb.toString() : orig;
+  }
+
+  /**
+   * Tokenises the description, appending text to {@link #sb} and recording
+   * every anchor it meets in {@link #links}.
+   */
+  private void parse(String description, String newline)
+  {
+    StringTokenizer st = new StringTokenizer(description, "<");
+    while (st.hasMoreElements())
+    {
+      // hasMoreElements() was evaluated with the previous delimiter set, so
+      // the remaining text may hold nothing but '&' / '>' characters.
+      String token = nextTokenOrNull(st, "&>");
+      if (token == null)
+      {
+        break;
+      }
+      if (token.equalsIgnoreCase("html") || token.startsWith("/"))
+      {
+        continue;
+      }
+
+      // A token may be "text<tag": split it into leading text and tag name.
+      String tag = null;
+      int startTag = token.indexOf("<");
+      if (startTag > -1)
+      {
+        tag = token.substring(startTag + 1);
+        token = token.substring(0, startTag);
+      }
+
+      if (tag != null && tag.toUpperCase().startsWith("A HREF="))
+      {
+        if (token.length() > 0)
+        {
+          sb.append(token);
+        }
+        String link = extractHref(tag);
+        // An anchor at the very end of the text has no label to read.
+        String label = nextTokenOrNull(st, "<>");
+        if (label == null)
+        {
+          label = "";
+        }
+        links.add(label + "|" + link);
+        sb.append(label).append(LINK_MARKER);
+      }
+      else if (tag != null && tag.equalsIgnoreCase("br"))
+      {
+        sb.append(newline);
+      }
+      else if (token.startsWith("lt;"))
+      {
+        // the '&' that introduced the entity was consumed as a delimiter
+        sb.append("<").append(token.substring(3));
+      }
+      else if (token.startsWith("gt;"))
+      {
+        sb.append(">").append(token.substring(3));
+      }
+      else if (token.startsWith("amp;"))
+      {
+        sb.append("&").append(token.substring(4));
+      }
+      else
+      {
+        sb.append(token);
+      }
+    }
+  }
+
+  /**
+   * Returns the text between the first double quote and the last character
+   * of an anchor tag, or an empty string when the tag is too short to hold a
+   * quoted URL.
+   */
+  private static String extractHref(String tag)
+  {
+    int start = tag.indexOf("\"") + 1;
+    int end = tag.length() - 1;
+    return (start <= end) ? tag.substring(start, end) : "";
+  }
+
+  /**
+   * Switches the tokenizer to the given delimiters and returns the next
+   * token, or null when only delimiter characters remain.
+   */
+  private static String nextTokenOrNull(StringTokenizer st, String delims)
+  {
+    try
+    {
+      return st.nextToken(delims);
+    } catch (java.util.NoSuchElementException ignored)
+    {
+      // Whether a token exists under the new delimiters cannot be checked
+      // up front, so exhaustion is reported to the caller as null.
+      return null;
+    }
+  }
+}