Merge branch 'patch/JAL-3874_newJmolAndGradleDedup' into develop

[jalview.git] / src / jalview / util / ParseHtmlBodyAndLinks.java
diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java

index 19726b9..0f2d01d 100644 (file)
--- a/src/jalview/util/ParseHtmlBodyAndLinks.java
+++ b/src/jalview/util/ParseHtmlBodyAndLinks.java
@@ -20,7 +20,10 @@
   */
  package jalview.util;
  
+import java.util.Locale;
+
  import java.util.ArrayList;
+import java.util.List;
  import java.util.StringTokenizer;
  import java.util.regex.Pattern;
  
@@ -32,6 +35,9 @@ import java.util.regex.Pattern;
   */
  public class ParseHtmlBodyAndLinks
  {
+  private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern
+          .compile("<");
+
    String orig = null;
  
    public String getOrig()
@@ -49,9 +55,9 @@ public class ParseHtmlBodyAndLinks
      return htmlContent;
    }
  
-  ArrayList<String> links = new ArrayList<String>();
+  List<String> links = new ArrayList<String>();
  
-  StringBuffer sb = new StringBuffer();
+  String content;
  
    /**
     * result of parsing description - with or without HTML tags
@@ -61,7 +67,7 @@ public class ParseHtmlBodyAndLinks
    public String getContent()
    {
  
-    return sb.toString();
+    return content;
    }
  
    /**
@@ -69,12 +75,19 @@ public class ParseHtmlBodyAndLinks
     * 
     * @return
     */
-  public ArrayList<String> getLinks()
+  public List<String> getLinks()
    {
      return links;
    }
  
    /**
+   * Parses the given html and
+   * <ul>
+   * <li>extracts any 'href' links to a list of "displayName|url" strings,
+   * retrievable by {@link #getLinks()}</li>
+   * <li>extracts the remaining text (with %LINK% placeholders replacing hrefs),
+   * retrievable by {@link #getContent()}</li>
+   * </ul>
     * 
     * @param description
     *          - html or text content to be parsed
@@ -91,7 +104,8 @@ public class ParseHtmlBodyAndLinks
        htmlContent = false;
        return;
      }
-    if (description.toUpperCase().indexOf("<HTML>") == -1)
+    StringBuilder sb = new StringBuilder(description.length());
+    if (description.toUpperCase(Locale.ROOT).indexOf("<HTML>") == -1)
      {
        htmlContent = false;
      }
@@ -102,7 +116,7 @@ public class ParseHtmlBodyAndLinks
      String tag = null;
      while (st.hasMoreElements())
      {
-      token = st.nextToken("&>");
+      token = st.nextToken(">");
        if (token.equalsIgnoreCase("html") || token.startsWith("/"))
        {
          continue;
@@ -117,7 +131,7 @@ public class ParseHtmlBodyAndLinks
          token = token.substring(0, startTag);
        }
  
-      if (tag != null && tag.toUpperCase().startsWith("A HREF="))
+      if (tag != null && tag.toUpperCase(Locale.ROOT).startsWith("A HREF="))
        {
          if (token.length() > 0)
          {
@@ -132,18 +146,6 @@ public class ParseHtmlBodyAndLinks
        {
          sb.append(newline);
        }
-      else if (token.startsWith("lt;"))
-      {
-        sb.append("<" + token.substring(3));
-      }
-      else if (token.startsWith("gt;"))
-      {
-        sb.append(">" + token.substring(3));
-      }
-      else if (token.startsWith("amp;"))
-      {
-        sb.append("&" + token.substring(4));
-      }
        else
        {
          sb.append(token);
@@ -153,11 +155,18 @@ public class ParseHtmlBodyAndLinks
      {
        // instead of parsing the html into plaintext
        // clean the description ready for embedding in html
-      sb = new StringBuffer(Pattern.compile("<").matcher(description)
+      sb = new StringBuilder(LEFT_ANGLE_BRACKET_PATTERN.matcher(description)
                .replaceAll("&lt;"));
-
      }
+    content = translateEntities(sb.toString());
+  }
  
+  private String translateEntities(String s)
+  { // unescapes the entities &lt; &gt; &amp; exactly once (no double decoding)
+    s = s.replace("&lt;", "<"); // literal replacement; no regex is needed
+    s = s.replace("&gt;", ">");
+    s = s.replace("&amp;", "&"); // must be last: "&amp;lt;" means "&lt;", not "<"
+    return s;
    }
  
    /**
@@ -168,7 +177,7 @@ public class ParseHtmlBodyAndLinks
     */
    public String getNonHtmlContent()
    {
-    return isHtmlContent() ? sb.toString() : orig;
+    return isHtmlContent() ? content : orig;
    }
  
  }