src/jalview/util/ParseHtmlBodyAndLinks.java

   1 package jalview.util;
   2
   3 import java.util.ArrayList;
   4 import java.util.StringTokenizer;
   5 import java.util.regex.Pattern;
   6
   7
   8 /**
   9  * utility class for dealing with HTML link extraction
  10  * @author jprocter
  11  *
  12  */
  13 public class ParseHtmlBodyAndLinks
  14   {
  15     String orig=null;
  16     public String getOrig()
  17     {
  18       return orig;
  19     }
  20     boolean htmlContent=true;
  21     /**
  22      * @return true if the content looked like HTML
  23
  24      */
  25     public boolean isHtmlContent()
  26     {
  27       return htmlContent;
  28     }
  29
  30     ArrayList<String> links=new ArrayList<String>();
  31     StringBuffer sb = new StringBuffer();
  32     /**
  33      * result of parsing description - with or without HTML tags
  34      * @return
  35      */
  36     public String getContent()
  37     {
  38
  39       return sb.toString();
  40     }
  41     /**
  42      * list of Label|Link encoded URL links extracted from HTML
  43      * @return
  44      */
  45     public ArrayList<String> getLinks() {
  46       return links;
  47     }
  48
  49     /**
  50      *
  51      * @param description - html or text content to be parsed
  52      * @param removeHTML flag to indicate if HTML tags should be removed if they are present.
  53      * @param newline
  54      */
  55     public ParseHtmlBodyAndLinks(String description,
  56           boolean removeHTML, String newline)
  57     {
  58       if (description==null || description.length()==0)
  59       {
  60         htmlContent=false;
  61         return;
  62       }
  63     if (description.toUpperCase().indexOf("<HTML>") == -1)
  64     {
  65       htmlContent = false;
  66     }
  67     orig = description;
  68     StringTokenizer st = new StringTokenizer(description, "<");
  69     String token, link;
  70     int startTag;
  71     String tag = null;
  72     while (st.hasMoreElements())
  73     {
  74       token = st.nextToken("&>");
  75       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
  76       {
  77         continue;
  78       }
  79
  80       tag = null;
  81       startTag = token.indexOf("<");
  82
  83       if (startTag > -1)
  84       {
  85         tag = token.substring(startTag + 1);
  86         token = token.substring(0, startTag);
  87       }
  88
  89       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
  90       {
  91         if (token.length() > 0)
  92         {
  93           sb.append(token);
  94         }
  95         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
  96         String label = st.nextToken("<>");
  97         links.add(label + "|" + link);
  98         sb.append(label + "%LINK%");
  99       }
 100       else if (tag != null && tag.equalsIgnoreCase("br"))
 101       {
 102         sb.append(newline);
 103       }
 104       else if (token.startsWith("lt;"))
 105       {
 106         sb.append("<" + token.substring(3));
 107       }
 108       else if (token.startsWith("gt;"))
 109       {
 110         sb.append(">" + token.substring(3));
 111       }
 112       else if (token.startsWith("amp;"))
 113       {
 114         sb.append("&" + token.substring(4));
 115       }
 116       else
 117       {
 118         sb.append(token);
 119       }
 120     }
 121     if (removeHTML && !htmlContent)
 122     {
 123       // instead of parsing the html into plaintext
 124       // clean the description ready for embedding in html
 125       sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;"));
 126
 127     }
 128
 129   }
 130     /**
 131      * get either the parsed content or the original, depending on whether the original looked like html content or not.
 132      * @return
 133      */
 134     public String getNonHtmlContent()
 135     {
 136       return isHtmlContent() ? sb.toString() : orig;
 137     }
 138
 139 }