JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions

[jalview.git] / src / jalview / util / ParseHtmlBodyAndLinks.java
diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java

new file mode 100644 (file)

index 0000000..5cb0a46
--- /dev/null
+++ b/src/jalview/util/ParseHtmlBodyAndLinks.java
@@ -0,0 +1,139 @@
+package jalview.util;
+
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+import java.util.regex.Pattern;
+
+
+/**
+ * Utility class for extracting displayable text and hyperlinks from a
+ * description string that may contain simple HTML markup.
+ * <p>
+ * All parsing happens in the constructor. The getters then expose the original
+ * description, the parsed content, the extracted links and a flag recording
+ * whether the literal text {@code <html>} (any letter case) was present.
+ *
+ * @author jprocter
+ *
+ */
+public class ParseHtmlBodyAndLinks
+  {
+    String orig=null; // description exactly as given; stays null for null/empty input
+    public String getOrig() // returns the unmodified description, or null if it was null or empty
+    {
+      return orig;
+    }
+    boolean htmlContent=true; // cleared by the constructor when no "<HTML>" marker is found
+    /**
+     * Reports whether the description contained the literal text
+     * {@code <html>}, compared after upper-casing the whole description.
+     * Always false for a null or empty description.
+     *
+     * @return true if the content looked like HTML
+     */
+    public boolean isHtmlContent()
+    {
+      return htmlContent;
+    }
+
+    ArrayList<String> links=new ArrayList<String>(); // "label|link" entries, in the order they were found
+    StringBuffer sb = new StringBuffer(); // accumulates the parsed content
+    /**
+     * Result of processing the description in the constructor.
+     *
+     * @return the text built while tokenizing the description (tags dropped,
+     *         anchors replaced by their label followed by {@code %LINK%}); or,
+     *         when removeHTML was true and no {@code <html>} marker was found,
+     *         the original description with every {@code <} replaced by the
+     *         four characters {@code &lt;}. Empty for a null or empty
+     *         description.
+     */
+    public String getContent()
+    {
+      
+      return sb.toString();
+    }
+    /**
+     * Links extracted from anchor tags, each encoded as {@code Label|Link}.
+     *
+     * @return the internal list itself (not a copy); empty if no anchors were
+     *         found
+     */
+    public ArrayList<String> getLinks() {
+      return links;
+    }
+
+    /**
+     * Parses the description once, filling the content buffer and link list.
+     * <p>
+     * A null or empty description only clears the HTML flag; the original text
+     * stays null and the content and links stay empty.
+     * <p>
+     * Otherwise the description is tokenized on {@code &} and {@code >}. Those
+     * delimiter characters are never returned as tokens, so any that are not
+     * handled below do not reach the content. Within each token the text after
+     * the first {@code <} is treated as a tag and the text before it as plain
+     * text:
+     * <ul>
+     * <li>a tag starting with {@code A HREF=} (any letter case) records a
+     * {@code label|link} entry and appends the label plus {@code %LINK%};</li>
+     * <li>a {@code br} tag appends the newline string;</li>
+     * <li>tokens starting with {@code lt;}, {@code gt;} or {@code amp;} are
+     * decoded to {@code <}, {@code >} or {@code &} followed by the rest of the
+     * token;</li>
+     * <li>for anything else only the plain text is appended, so other tags are
+     * dropped.</li>
+     * </ul>
+     * NOTE(review): the removeHTML flag is only consulted when no
+     * {@code <html>} marker was found, and in that case it replaces the parsed
+     * content with an escaped copy of the original - confirm this matches what
+     * callers expect from the flag's name.
+     *
+     * @param description - html or text content to be parsed; may be null
+     * @param removeHTML if true and the description has no {@code <html>}
+     *          marker, the content becomes the description with each
+     *          {@code <} replaced by {@code &lt;}
+     * @param newline string appended to the content for each {@code br} tag
+     */
+    public ParseHtmlBodyAndLinks(String description,
+          boolean removeHTML, String newline)
+    {
+      if (description==null || description.length()==0)
+      {
+        htmlContent=false; // nothing to parse; orig stays null
+        return;
+      }
+    if (description.toUpperCase().indexOf("<HTML>") == -1)
+    {
+      htmlContent = false;
+    }
+    orig = description;
+    StringTokenizer st = new StringTokenizer(description, "<"); // nextToken(delim) below replaces this delimiter set
+    String token, link;
+    int startTag;
+    String tag = null;
+    while (st.hasMoreElements())
+    {
+      token = st.nextToken("&>"); // '<' is not a delimiter here, so it stays inside the token
+      if (token.equalsIgnoreCase("html") || token.startsWith("/"))
+      {
+        continue; // ignore a bare "html" token or one beginning with '/'
+      }
+
+      tag = null;
+      startTag = token.indexOf("<");
+
+      if (startTag > -1)
+      {
+        tag = token.substring(startTag + 1); // everything after the first '<'
+        token = token.substring(0, startTag); // plain text before the tag
+      }
+
+      if (tag != null && tag.toUpperCase().startsWith("A HREF="))
+      {
+        if (token.length() > 0)
+        {
+          sb.append(token);
+        }
+        link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1); // after the first '"', minus the tag's last char (presumably the closing quote)
+        String label = st.nextToken("<>"); // NOTE(review): throws NoSuchElementException if no token follows the anchor tag
+        links.add(label + "|" + link);
+        sb.append(label + "%LINK%");
+      }
+      else if (tag != null && tag.equalsIgnoreCase("br"))
+      {
+        sb.append(newline); // NOTE(review): text before the br tag in this token is not appended - confirm intended
+      }
+      else if (token.startsWith("lt;"))
+      {
+        sb.append("<" + token.substring(3));
+      }
+      else if (token.startsWith("gt;"))
+      {
+        sb.append(">" + token.substring(3));
+      }
+      else if (token.startsWith("amp;"))
+      {
+        sb.append("&" + token.substring(4));
+      }
+      else
+      {
+        sb.append(token); // unrecognised tag (if any) is dropped; only the plain text is kept
+      }
+    }
+    if (removeHTML && !htmlContent)
+    {
+      // no <html> marker was found: discard the parsed text and use the original
+      // description with each '<' escaped, ready for embedding in html
+      sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;"));        
+      
+    }
+    
+  }
+    /**
+     * Get either the parsed content or the original, depending on whether the
+     * original looked like html content or not.
+     *
+     * @return the content buffer's text when {@link #isHtmlContent()} is true,
+     *         otherwise the original description (null if the description was
+     *         null or empty)
+     */
+    public String getNonHtmlContent()
+    {
+      return isHtmlContent() ? sb.toString() : orig;
+    }
+
+}