JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions
author jprocter <jprocter@compbio.dundee.ac.uk>
Fri, 19 Aug 2011 10:03:02 +0000 (11:03 +0100)
committer jprocter <jprocter@compbio.dundee.ac.uk>
Fri, 19 Aug 2011 10:03:02 +0000 (11:03 +0100)
src/jalview/io/FeaturesFile.java
src/jalview/util/ParseHtmlBodyAndLinks.java [new file with mode: 0644]

index 34253cb..c3640bf 100755 (executable)
@@ -633,71 +633,12 @@ public class FeaturesFile extends AlignFile
     {\r
       return;\r
     }\r
+    jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(sf.getDescription(), removeHTML, newline);\r
 \r
-    if (removeHTML\r
-            && sf.getDescription().toUpperCase().indexOf("<HTML>") == -1)\r
+    sf.description = (removeHTML) ? parsed.getNonHtmlContent() : sf.description;\r
+    for (String link:parsed.getLinks())\r
     {\r
-      removeHTML = false;\r
-    }\r
-\r
-    StringBuffer sb = new StringBuffer();\r
-    StringTokenizer st = new StringTokenizer(sf.getDescription(), "<");\r
-    String token, link;\r
-    int startTag;\r
-    String tag = null;\r
-    while (st.hasMoreElements())\r
-    {\r
-      token = st.nextToken("&>");\r
-      if (token.equalsIgnoreCase("html") || token.startsWith("/"))\r
-      {\r
-        continue;\r
-      }\r
-\r
-      tag = null;\r
-      startTag = token.indexOf("<");\r
-\r
-      if (startTag > -1)\r
-      {\r
-        tag = token.substring(startTag + 1);\r
-        token = token.substring(0, startTag);\r
-      }\r
-\r
-      if (tag != null && tag.toUpperCase().startsWith("A HREF="))\r
-      {\r
-        if (token.length() > 0)\r
-        {\r
-          sb.append(token);\r
-        }\r
-        link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);\r
-        String label = st.nextToken("<>");\r
-        sf.addLink(label + "|" + link);\r
-        sb.append(label + "%LINK%");\r
-      }\r
-      else if (tag != null && tag.equalsIgnoreCase("br"))\r
-      {\r
-        sb.append(newline);\r
-      }\r
-      else if (token.startsWith("lt;"))\r
-      {\r
-        sb.append("<" + token.substring(3));\r
-      }\r
-      else if (token.startsWith("gt;"))\r
-      {\r
-        sb.append(">" + token.substring(3));\r
-      }\r
-      else if (token.startsWith("amp;"))\r
-      {\r
-        sb.append("&" + token.substring(4));\r
-      }\r
-      else\r
-      {\r
-        sb.append(token);\r
-      }\r
-    }\r
-\r
-    if (removeHTML)\r
-    {\r
-      sf.description = sb.toString();\r
+      sf.addLink(link);\r
     }\r
 \r
   }\r
diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java
new file mode 100644 (file)
index 0000000..5cb0a46
--- /dev/null
@@ -0,0 +1,139 @@
+package jalview.util;
+
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+import java.util.regex.Pattern;
+
+
+/**
+ * Utility class for extracting plain text and hyperlinks from a description
+ * that may contain simple HTML markup.
+ * <p>
+ * The constructor does all the work: it walks the description once, collects
+ * every <code>&lt;a href="..."&gt;label&lt;/a&gt;</code> anchor as a
+ * <code>label|link</code> string, turns <code>&lt;br&gt;</code> into the
+ * supplied newline, and decodes the <code>&amp;lt;</code>, <code>&amp;gt;</code>
+ * and <code>&amp;amp;</code> entities. The results are then read back through
+ * the getters.
+ *
+ * @author jprocter
+ */
+public class ParseHtmlBodyAndLinks
+{
+  /** Matches each '&lt;' that must be escaped before plain text is embedded in HTML. */
+  private static final Pattern LEFT_ANGLE = Pattern.compile("<");
+
+  /** Prefix (upper case) identifying an anchor tag once the leading '&lt;' is stripped. */
+  private static final String ANCHOR_PREFIX = "A HREF=";
+
+  /** The description exactly as passed to the constructor. */
+  String orig = null;
+
+  /** False when the description was empty or contained no &lt;html&gt; tag. */
+  boolean htmlContent = true;
+
+  /** Links found in the description, each encoded as <code>label|link</code>. */
+  ArrayList<String> links = new ArrayList<String>();
+
+  /** Parsed (or, for plain text, '&lt;'-escaped) form of the description. */
+  StringBuffer sb = new StringBuffer();
+
+  /**
+   * @return the description that was passed to the constructor
+   */
+  public String getOrig()
+  {
+    return orig;
+  }
+
+  /**
+   * @return true if the content looked like HTML, i.e. it contained an
+   *         &lt;html&gt; tag (matched case-insensitively)
+   */
+  public boolean isHtmlContent()
+  {
+    return htmlContent;
+  }
+
+  /**
+   * Result of parsing the description. For HTML input this is the text with
+   * tags removed and each anchor replaced by its label followed by
+   * <code>%LINK%</code>. For non-HTML input parsed with removeHTML set, it is
+   * the original text with every '&lt;' escaped as <code>&amp;lt;</code>.
+   *
+   * @return the parsed content
+   */
+  public String getContent()
+  {
+    return sb.toString();
+  }
+
+  /**
+   * List of Label|Link encoded URL links extracted from the HTML.
+   *
+   * @return the live list of extracted links (empty if none were found)
+   */
+  public ArrayList<String> getLinks()
+  {
+    return links;
+  }
+
+  /**
+   * Parses the given description.
+   *
+   * @param description
+   *          html or text content to be parsed; may be null or empty, in which
+   *          case nothing is parsed and the content is flagged as non-HTML
+   * @param removeHTML
+   *          flag to indicate if HTML tags should be removed if they are
+   *          present; when set and the description is not HTML, the content is
+   *          instead escaped ready for embedding in HTML
+   * @param newline
+   *          text appended in place of each &lt;br&gt; tag
+   */
+  public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
+          String newline)
+  {
+    // Record the input before any early exit so that getOrig() and
+    // getNonHtmlContent() hand back "" (not null) for an empty description.
+    orig = description;
+    if (description == null || description.length() == 0)
+    {
+      htmlContent = false;
+      return;
+    }
+    if (description.toUpperCase().indexOf("<HTML>") == -1)
+    {
+      htmlContent = false;
+    }
+
+    // Each token returned with the "&>" delimiters is "text<tag" (the text
+    // may be empty) or, following a '&', an entity name plus trailing text.
+    // NOTE(review): because '&' is a delimiter, an href containing '&' is
+    // split mid-URL - confirm whether such links need to be supported.
+    StringTokenizer st = new StringTokenizer(description, "<");
+    while (st.hasMoreElements())
+    {
+      String token = st.nextToken("&>");
+      if (token.equalsIgnoreCase("html") || token.startsWith("/"))
+      {
+        continue;
+      }
+
+      String tag = null;
+      int startTag = token.indexOf("<");
+      if (startTag > -1)
+      {
+        tag = token.substring(startTag + 1);
+        token = token.substring(0, startTag);
+      }
+
+      // Text preceding the tag is kept whichever tag follows it.
+      String text = decodeLeadingEntity(token);
+
+      if (tag != null && tag.toUpperCase().startsWith(ANCHOR_PREFIX))
+      {
+        sb.append(text);
+        String link = extractHref(tag);
+        String label;
+        try
+        {
+          label = st.nextToken("<>");
+        } catch (java.util.NoSuchElementException noLabel)
+        {
+          // The description ends straight after the anchor tag: there is no
+          // label text, so fall back to showing the link itself. A tokenizer
+          // cannot be peeked with different delimiters, hence the catch.
+          label = link;
+        }
+        links.add(label + "|" + link);
+        sb.append(label).append("%LINK%");
+      }
+      else if (tag != null && tag.equalsIgnoreCase("br"))
+      {
+        sb.append(text);
+        sb.append(newline);
+      }
+      else
+      {
+        sb.append(text);
+      }
+    }
+
+    if (removeHTML && !htmlContent)
+    {
+      // instead of parsing the html into plaintext
+      // clean the description ready for embedding in html
+      sb = new StringBuffer(LEFT_ANGLE.matcher(description).replaceAll(
+              "&lt;"));
+    }
+  }
+
+  /**
+   * Get either the parsed content or the original, depending on whether the
+   * original looked like html content or not.
+   *
+   * @return the parsed content for HTML input, otherwise the original
+   *         description (null only if a null description was supplied)
+   */
+  public String getNonHtmlContent()
+  {
+    return isHtmlContent() ? sb.toString() : orig;
+  }
+
+  /**
+   * Replaces a leading entity remnant (the '&amp;' has already been consumed
+   * as a delimiter) with the character it stands for.
+   *
+   * @param token
+   *          text that may start with "lt;", "gt;" or "amp;"
+   * @return the token with any such prefix decoded, otherwise unchanged
+   */
+  private static String decodeLeadingEntity(String token)
+  {
+    if (token.startsWith("lt;"))
+    {
+      return "<" + token.substring(3);
+    }
+    if (token.startsWith("gt;"))
+    {
+      return ">" + token.substring(3);
+    }
+    if (token.startsWith("amp;"))
+    {
+      return "&" + token.substring(4);
+    }
+    return token;
+  }
+
+  /**
+   * Pulls the link target out of an anchor tag such as
+   * <code>a href="http://example.org"</code>.
+   *
+   * @param tag
+   *          anchor tag text without the surrounding angle brackets
+   * @return the text between the first double quote and the final character
+   *         of the tag; the text after <code>href=</code> when the value is
+   *         not double-quoted; or an empty string when the quoted value is
+   *         missing
+   */
+  private static String extractHref(String tag)
+  {
+    int quote = tag.indexOf("\"");
+    if (quote == -1)
+    {
+      return tag.substring(ANCHOR_PREFIX.length());
+    }
+    int start = quote + 1;
+    int end = tag.length() - 1; // drops the closing quote
+    return (end >= start) ? tag.substring(start, end) : "";
+  }
+}