From: jprocter Date: Fri, 19 Aug 2011 10:03:02 +0000 (+0100) Subject: JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions X-Git-Tag: Release_2_7~93 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=193d05af3d6716a7e7652f208d41a016a0039396 JAL-908 JAL-701 refactored HTML escape/unescape code for sequence feature descriptions --- diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index 34253cb..c3640bf 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -633,71 +633,12 @@ public class FeaturesFile extends AlignFile { return; } + jalview.util.ParseHtmlBodyAndLinks parsed = new jalview.util.ParseHtmlBodyAndLinks(sf.getDescription(), removeHTML, newline); - if (removeHTML - && sf.getDescription().toUpperCase().indexOf("<HTML>") == -1) + sf.description = (removeHTML) ? parsed.getNonHtmlContent() : sf.description; + for (String link:parsed.getLinks()) { - removeHTML = false; - } - - StringBuffer sb = new StringBuffer(); - StringTokenizer st = new StringTokenizer(sf.getDescription(), "<"); - String token, link; - int startTag; - String tag = null; - while (st.hasMoreElements()) - { - token = st.nextToken("&>"); - if (token.equalsIgnoreCase("html") || token.startsWith("/")) - { - continue; - } - - tag = null; - startTag = token.indexOf("<"); - - if (startTag > -1) - { - tag = token.substring(startTag + 1); - token = token.substring(0, startTag); - } - - if (tag != null && tag.toUpperCase().startsWith("A HREF=")) - { - if (token.length() > 0) - { - sb.append(token); - } - link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1); - String label = st.nextToken("<>"); - sf.addLink(label + "|" + link); - sb.append(label + "%LINK%"); - } - else if (tag != null && tag.equalsIgnoreCase("br")) - { - sb.append(newline); - } - else if (token.startsWith("lt;")) - { - sb.append("<" + token.substring(3)); - } - else if (token.startsWith("gt;")) - { - 
sb.append(">" + token.substring(3)); - } - else if (token.startsWith("amp;")) - { - sb.append("&" + token.substring(4)); - } - else - { - sb.append(token); - } - } - - if (removeHTML) - { - sf.description = sb.toString(); + sf.addLink(link); } } diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java new file mode 100644 index 0000000..5cb0a46 --- /dev/null +++ b/src/jalview/util/ParseHtmlBodyAndLinks.java @@ -0,0 +1,139 @@ +package jalview.util; + +import java.util.ArrayList; +import java.util.StringTokenizer; +import java.util.regex.Pattern; + + +/** + * utility class for dealing with HTML link extraction + * @author jprocter + * + */ +public class ParseHtmlBodyAndLinks + { + String orig=null; + public String getOrig() + { + return orig; + } + boolean htmlContent=true; + /** + * @return true if the content looked like HTML + + */ + public boolean isHtmlContent() + { + return htmlContent; + } + + ArrayList<String> links=new ArrayList<String>(); + StringBuffer sb = new StringBuffer(); + /** + * result of parsing description - with or without HTML tags + * @return + */ + public String getContent() + { + + return sb.toString(); + } + /** + * list of Label|Link encoded URL links extracted from HTML + * @return + */ + public ArrayList<String> getLinks() { + return links; + } + + /** + * + * @param description - html or text content to be parsed + * @param removeHTML flag to indicate if HTML tags should be removed if they are present. 
+ * @param newline + */ + public ParseHtmlBodyAndLinks(String description, + boolean removeHTML, String newline) + { + if (description==null || description.length()==0) + { + htmlContent=false; + return; + } + if (description.toUpperCase().indexOf("<HTML>") == -1) + { + htmlContent = false; + } + orig = description; + StringTokenizer st = new StringTokenizer(description, "<"); + String token, link; + int startTag; + String tag = null; + while (st.hasMoreElements()) + { + token = st.nextToken("&>"); + if (token.equalsIgnoreCase("html") || token.startsWith("/")) + { + continue; + } + + tag = null; + startTag = token.indexOf("<"); + + if (startTag > -1) + { + tag = token.substring(startTag + 1); + token = token.substring(0, startTag); + } + + if (tag != null && tag.toUpperCase().startsWith("A HREF=")) + { + if (token.length() > 0) + { + sb.append(token); + } + link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1); + String label = st.nextToken("<>"); + links.add(label + "|" + link); + sb.append(label + "%LINK%"); + } + else if (tag != null && tag.equalsIgnoreCase("br")) + { + sb.append(newline); + } + else if (token.startsWith("lt;")) + { + sb.append("<" + token.substring(3)); + } + else if (token.startsWith("gt;")) + { + sb.append(">" + token.substring(3)); + } + else if (token.startsWith("amp;")) + { + sb.append("&" + token.substring(4)); + } + else + { + sb.append(token); + } + } + if (removeHTML && !htmlContent) + { + // instead of parsing the html into plaintext + // clean the description ready for embedding in html + sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;")); + + } + + } + /** + * get either the parsed content or the original, depending on whether the original looked like html content or not. + * @return + */ + public String getNonHtmlContent() + { + return isHtmlContent() ? sb.toString() : orig; + } + +}