From: jprocter <jprocter@compbio.dundee.ac.uk>
Date: Fri, 19 Aug 2011 10:03:02 +0000 (+0100)
Subject: JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions
X-Git-Tag: Release_2_7~93
X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=193d05af3d6716a7e7652f208d41a016a0039396

JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions
---

diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java
index 34253cb..c3640bf 100755
--- a/src/jalview/io/FeaturesFile.java
+++ b/src/jalview/io/FeaturesFile.java
@@ -633,71 +633,12 @@ public class FeaturesFile extends AlignFile
     {
       return;
     }
+    jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(sf.getDescription(), removeHTML, newline);
 
-    if (removeHTML
-            && sf.getDescription().toUpperCase().indexOf("<HTML>") == -1)
+    sf.description = (removeHTML) ? parsed.getNonHtmlContent() : sf.description;
+    for (String link:parsed.getLinks())
     {
-      removeHTML = false;
-    }
-
-    StringBuffer sb = new StringBuffer();
-    StringTokenizer st = new StringTokenizer(sf.getDescription(), "<");
-    String token, link;
-    int startTag;
-    String tag = null;
-    while (st.hasMoreElements())
-    {
-      token = st.nextToken("&>");
-      if (token.equalsIgnoreCase("html") || token.startsWith("/"))
-      {
-        continue;
-      }
-
-      tag = null;
-      startTag = token.indexOf("<");
-
-      if (startTag > -1)
-      {
-        tag = token.substring(startTag + 1);
-        token = token.substring(0, startTag);
-      }
-
-      if (tag != null && tag.toUpperCase().startsWith("A HREF="))
-      {
-        if (token.length() > 0)
-        {
-          sb.append(token);
-        }
-        link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
-        String label = st.nextToken("<>");
-        sf.addLink(label + "|" + link);
-        sb.append(label + "%LINK%");
-      }
-      else if (tag != null && tag.equalsIgnoreCase("br"))
-      {
-        sb.append(newline);
-      }
-      else if (token.startsWith("lt;"))
-      {
-        sb.append("<" + token.substring(3));
-      }
-      else if (token.startsWith("gt;"))
-      {
-        sb.append(">" + token.substring(3));
-      }
-      else if (token.startsWith("amp;"))
-      {
-        sb.append("&" + token.substring(4));
-      }
-      else
-      {
-        sb.append(token);
-      }
-    }
-
-    if (removeHTML)
-    {
-      sf.description = sb.toString();
+      sf.addLink(link);
     }
 
   }
diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java
new file mode 100644
index 0000000..5cb0a46
--- /dev/null
+++ b/src/jalview/util/ParseHtmlBodyAndLinks.java
@@ -0,0 +1,139 @@
+package jalview.util;
+
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+import java.util.regex.Pattern;
+
+
+/**
+ * Utility class that extracts "label|url" links from an HTML-ish description and
+ * builds a tag-stripped text version of it. All parsing happens in the constructor.
+ * @author jprocter
+ */
+public class ParseHtmlBodyAndLinks
+  {
+    String orig=null; // description as passed to the constructor; left null for null or empty input
+    public String getOrig()
+    {
+      return orig;
+    }
+    boolean htmlContent=true; // set to false by the constructor unless an <HTML> tag is found
+    /**
+     * @return true if the description contained an &lt;HTML&gt; tag (matched case
+     *         insensitively); false otherwise, including for null or empty input
+     */
+    public boolean isHtmlContent()
+    {
+      return htmlContent;
+    }
+    // "label|url" entries, appended in the order anchors are met while parsing
+    ArrayList<String> links=new ArrayList<String>();
+    StringBuffer sb = new StringBuffer(); // text produced by the constructor
+    /**
+     * Text built by the constructor: tags dropped, each anchor shown as its label followed by %LINK%.
+     * @return that text (empty for null or empty input); if removeHTML was set and no HTML tag was found, the description with each {@code <} replaced by {@code &lt;}
+     */
+    public String getContent()
+    {
+      
+      return sb.toString();
+    }
+    /**
+     * List of "label|url" strings, one per anchor tag recognised in the description.
+     * @return the internal list itself (not a copy); empty if no links were found
+     */
+    public ArrayList<String> getLinks() {
+      return links;
+    }
+
+    /**
+     * Parses the description straight away, filling the content buffer and the link list.
+     * @param description - html or text content to be parsed; null or empty leaves the content and the link list empty
+     * @param removeHTML when true and the description has no HTML tag, the content becomes the description with {@code <} escaped as {@code &lt;}
+     * @param newline text appended to the content for each br tag
+     */
+    public ParseHtmlBodyAndLinks(String description,
+          boolean removeHTML, String newline)
+    {
+      if (description==null || description.length()==0)
+      {
+        htmlContent=false;
+        return; // orig stays null and sb stays empty
+      }
+    if (description.toUpperCase().indexOf("<HTML>") == -1)
+    {
+      htmlContent = false;
+    }
+    orig = description;
+    StringTokenizer st = new StringTokenizer(description, "<"); // delimiters are switched by each nextToken(delims) call below
+    String token, link;
+    int startTag;
+    String tag = null;
+    while (st.hasMoreElements())
+    {
+      token = st.nextToken("&>"); // token ends at '&' or '>', so it may look like "text<tag"
+      if (token.equalsIgnoreCase("html") || token.startsWith("/"))
+      {
+        continue;
+      }
+      // a token shaped like "text<tag" is split into the text and the tag content
+      tag = null;
+      startTag = token.indexOf("<");
+
+      if (startTag > -1)
+      {
+        tag = token.substring(startTag + 1);
+        token = token.substring(0, startTag);
+      }
+      // handle anchors and br tags first, then entity remainders left after a '&'
+      if (tag != null && tag.toUpperCase().startsWith("A HREF="))
+      {
+        if (token.length() > 0)
+        {
+          sb.append(token);
+        }
+        link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1); // from after the first '"' up to, but excluding, the tag's last character
+        String label = st.nextToken("<>"); // anchor text: runs to the next '<' or '>'
+        links.add(label + "|" + link);
+        sb.append(label + "%LINK%");
+      }
+      else if (tag != null && tag.equalsIgnoreCase("br"))
+      {
+        sb.append(newline); // NOTE(review): any text before the br in this token is not appended - confirm intended
+      }
+      else if (token.startsWith("lt;"))
+      {
+        sb.append("<" + token.substring(3)); // treated as the remainder of an &lt; entity
+      }
+      else if (token.startsWith("gt;"))
+      {
+        sb.append(">" + token.substring(3)); // treated as the remainder of an &gt; entity
+      }
+      else if (token.startsWith("amp;"))
+      {
+        sb.append("&" + token.substring(4)); // treated as the remainder of an &amp; entity
+      }
+      else
+      {
+        sb.append(token);
+      }
+    }
+    if (removeHTML && !htmlContent)
+    {
+      // description had no <HTML> tag: discard the parsed text and keep the
+      // original with every "<" escaped as "&lt;", ready for embedding in html
+      sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;"));        
+      
+    }
+    
+  }
+    /**
+     * Get either the parsed content or the original, depending on whether the original looked like html content or not.
+     * @return the parsed text if an HTML tag was seen, otherwise the unmodified description (null if it was null or empty)
+     */
+    public String getNonHtmlContent()
+    {
+      return isHtmlContent() ? sb.toString() : orig;
+    }
+
+}