3 import java.util.ArrayList;
4 import java.util.StringTokenizer;
5 import java.util.regex.Pattern;
9 * utility class for dealing with HTML link extraction
13 public class ParseHtmlBodyAndLinks
// NOTE(review): only the signature appears in this excerpt. Presumably returns the
// unprocessed input held in `orig` (the value getNonHtmlContent falls back to) - confirm.
16 public String getOrig()
// Starts out true. The constructor reads it to decide whether plain text needs
// escaping, and getNonHtmlContent reads it through isHtmlContent().
// NOTE(review): the statements that clear this flag are not visible in this excerpt -
// presumably the null/empty and missing-<HTML> checks in the constructor; confirm.
20 boolean htmlContent=true;
22 * @return true if the content looked like HTML
25 public boolean isHtmlContent()
// Links collected by the constructor; each entry is stored as label + "|" + link.
30 ArrayList<String> links=new ArrayList<String>();
// Output buffer the constructor appends processed text to; read back via sb.toString().
31 StringBuffer sb = new StringBuffer();
33 * result of parsing description - with or without HTML tags
// NOTE(review): body not shown in this excerpt; presumably returns sb.toString() - confirm.
36 public String getContent()
42 * list of Label|Link encoded URL links extracted from HTML
// Entries are built in the constructor as label + "|" + link.
// NOTE(review): body not shown in this excerpt; presumably returns the `links` field - confirm.
45 public ArrayList<String> getLinks() {
// Constructor: scans `description`, records hyperlinks in `links` and builds the
// processed text in `sb`.
// NOTE(review): this listing is an excerpt - the embedded original line numbers skip,
// so braces and several statements are not shown. The comments below describe only
// the statements that are visible.
// NOTE(review): `newline` has no visible @param text; presumably it is the string
// substituted for <br> tags (the body of the br branch is not shown) - confirm.
51 * @param description - html or text content to be parsed
52 * @param removeHTML flag to indicate if HTML tags should be removed if they are present.
55 public ParseHtmlBodyAndLinks(String description,
56 boolean removeHTML, String newline)
// Guard for null or empty input (the guarded statements are not shown here).
58 if (description==null || description.length()==0)
// Case-insensitive test: true when no literal <HTML> tag occurs anywhere in the input.
63 if (description.toUpperCase().indexOf("<HTML>") == -1)
// Created with '<' as delimiter; each nextToken(delims) call below replaces the
// delimiter set, and the new set stays in force until the next such call.
68 StringTokenizer st = new StringTokenizer(description, "<");
72 while (st.hasMoreElements())
// Delimiters '&' and '>': a token ends at a tag close or at the start of an entity,
// and the delimiter character itself is not part of any token.
74 token = st.nextToken("&>");
// Matches a token that is exactly "html" (any case) or starts with "/"
// (the handling of such tokens is not shown in this excerpt).
75 if (token.equalsIgnoreCase("html") || token.startsWith("/"))
// A '<' inside the token marks where a tag begins: the text after it becomes `tag`,
// the text before it stays in `token`.
// NOTE(review): the condition guarding the two substring calls is not shown; the later
// `tag != null` tests suggest `tag` stays null when no '<' is found - confirm.
81 startTag = token.indexOf("<");
85 tag = token.substring(startTag + 1);
86 token = token.substring(0, startTag);
// Anchor tags: tag text beginning with A HREF= (compared in upper case).
89 if (tag != null && tag.toUpperCase().startsWith("A HREF="))
91 if (token.length() > 0)
// Link target: everything after the first double quote up to, but excluding, the
// last character of the tag text.
95 link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
// Link label: the next token delimited by '<' or '>'.
96 String label = st.nextToken("<>");
// Record the pair as label|link and put the label plus a %LINK% marker in the output.
97 links.add(label + "|" + link);
98 sb.append(label + "%LINK%");
100 else if (tag != null && tag.equalsIgnoreCase("br"))
// The '&' delimiter has already consumed the ampersand, so the entities &lt; &gt; &amp;
// arrive as tokens starting with lt; gt; amp; and are restored to the literal character.
104 else if (token.startsWith("lt;"))
106 sb.append("<" + token.substring(3));
108 else if (token.startsWith("gt;"))
110 sb.append(">" + token.substring(3));
112 else if (token.startsWith("amp;"))
114 sb.append("&" + token.substring(4));
// Input that was not recognised as HTML, with removeHTML set: replace the buffer with
// the description in which every '<' is escaped as the entity &lt;. The replacement
// must be the entity - replacing '<' with itself would leave the text unchanged.
121 if (removeHTML && !htmlContent)
123 // instead of parsing the html into plaintext
124 // clean the description ready for embedding in html
125 sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;"));
131 * get either the parsed content or the original, depending on whether the original looked like html content or not.
134 public String getNonHtmlContent()
// When the input looked like HTML, return the text accumulated in `sb`;
// otherwise return the original string `orig` unchanged.
136 return isHtmlContent() ? sb.toString() : orig;