X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Futil%2FParseHtmlBodyAndLinks.java;h=0f2d01d0b7618405e1f51ad5da68d1f4a5993856;hb=41b0e9331ac71787c1280aa1d809f54c575fbf97;hp=5cb0a465fefa24e7c83744a44dc573962fd30519;hpb=193d05af3d6716a7e7652f208d41a016a0039396;p=jalview.git diff --git a/src/jalview/util/ParseHtmlBodyAndLinks.java b/src/jalview/util/ParseHtmlBodyAndLinks.java index 5cb0a46..0f2d01d 100644 --- a/src/jalview/util/ParseHtmlBodyAndLinks.java +++ b/src/jalview/util/ParseHtmlBodyAndLinks.java @@ -1,66 +1,111 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ package jalview.util; +import java.util.Locale; + import java.util.ArrayList; +import java.util.List; import java.util.StringTokenizer; import java.util.regex.Pattern; - /** * utility class for dealing with HTML link extraction + * * @author jprocter - * + * */ public class ParseHtmlBodyAndLinks +{ + private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern + .compile("<"); + + String orig = null; + + public String getOrig() { - String orig=null; - public String getOrig() - { - return orig; - } - boolean htmlContent=true; - /** - * @return true if the content looked like HTML + return orig; + } - */ - public boolean isHtmlContent() - { - return htmlContent; - } + boolean htmlContent = true; - ArrayList links=new ArrayList(); - StringBuffer sb = new StringBuffer(); - /** - * result of parsing description - with or without HTML tags - * @return - */ - public String getContent() - { - - return sb.toString(); - } - /** - * list of Label|Link encoded URL links extracted from HTML - * @return - */ - public ArrayList getLinks() { - return links; - } + /** + * @return true if the content looked like HTML + */ + public boolean isHtmlContent() + { + return htmlContent; + } + + List links = new ArrayList(); + + String content; - /** - * - * @param description - html or text content to be parsed - * @param removeHTML flag to indicate if HTML tags should be removed if they are present. - * @param newline - */ - public ParseHtmlBodyAndLinks(String description, - boolean removeHTML, String newline) + /** + * result of parsing description - with or without HTML tags + * + * @return + */ + public String getContent() + { + + return content; + } + + /** + * list of Label|Link encoded URL links extracted from HTML + * + * @return + */ + public List getLinks() + { + return links; + } + + /** + * Parses the given html and + *
<ul> + *
<li>extracts any 'href' links to a list of "displayName|url" strings, + * retrievable by #getLinks</li> + *
<li>extracts the remaining text (with %LINK% placeholders replacing hrefs), + * retrievable by #getContent</li> + *
</ul>
+ * + * @param description + * - html or text content to be parsed + * @param removeHTML + * flag to indicate if HTML tags should be removed if they are + * present. + * @param newline + */ + public ParseHtmlBodyAndLinks(String description, boolean removeHTML, + String newline) + { + if (description == null || description.length() == 0) { - if (description==null || description.length()==0) - { - htmlContent=false; - return; - } - if (description.toUpperCase().indexOf("<HTML>") == -1) + htmlContent = false; + return; + } + StringBuilder sb = new StringBuilder(description.length()); + if (description.toUpperCase(Locale.ROOT).indexOf("<HTML>") == -1) { htmlContent = false; } @@ -71,7 +116,7 @@ public class ParseHtmlBodyAndLinks String tag = null; while (st.hasMoreElements()) { - token = st.nextToken("&>"); + token = st.nextToken(">"); if (token.equalsIgnoreCase("html") || token.startsWith("/")) { continue; @@ -86,7 +131,7 @@ public class ParseHtmlBodyAndLinks token = token.substring(0, startTag); } - if (tag != null && tag.toUpperCase().startsWith("A HREF=")) + if (tag != null && tag.toUpperCase(Locale.ROOT).startsWith("A HREF=")) { if (token.length() > 0) { @@ -101,18 +146,6 @@ public class ParseHtmlBodyAndLinks { sb.append(newline); } - else if (token.startsWith("lt;")) - { - sb.append("<" + token.substring(3)); - } - else if (token.startsWith("gt;")) - { - sb.append(">" + token.substring(3)); - } - else if (token.startsWith("amp;")) - { - sb.append("&" + token.substring(4)); - } else { sb.append(token); @@ -122,18 +155,29 @@ public class ParseHtmlBodyAndLinks { // instead of parsing the html into plaintext // clean the description ready for embedding in html - sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;")); - + sb = new StringBuilder(LEFT_ANGLE_BRACKET_PATTERN.matcher(description) + .replaceAll("&lt;")); } - + content = translateEntities(sb.toString()); + } + + private String translateEntities(String s) + { + s = s.replaceAll("&amp;", "&"); + s = 
 s.replaceAll("&lt;", "<"); + s = s.replaceAll("&gt;", ">"); + return s; + } + + /** + * get either the parsed content or the original, depending on whether the + * original looked like html content or not. + * + * @return + */ + public String getNonHtmlContent() + { + return isHtmlContent() ? content : orig; } - /** - * get either the parsed content or the original, depending on whether the original looked like html content or not. - * @return - */ - public String getNonHtmlContent() - { - return isHtmlContent() ? sb.toString() : orig; - } }