src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8)
   3  * Copyright (C) 2012 J Procter, AM Waterhouse, LM Lui, J Engelhardt, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18 package jalview.util;
  19
  20 import java.util.ArrayList;
  21 import java.util.StringTokenizer;
  22 import java.util.regex.Pattern;
  23
  24 /**
  25  * utility class for dealing with HTML link extraction
  26  *
  27  * @author jprocter
  28  *
  29  */
  30 public class ParseHtmlBodyAndLinks
  31 {
  32   String orig = null;
  33
  34   public String getOrig()
  35   {
  36     return orig;
  37   }
  38
  39   boolean htmlContent = true;
  40
  41   /**
  42    * @return true if the content looked like HTML
  43    */
  44   public boolean isHtmlContent()
  45   {
  46     return htmlContent;
  47   }
  48
  49   ArrayList<String> links = new ArrayList<String>();
  50
  51   StringBuffer sb = new StringBuffer();
  52
  53   /**
  54    * result of parsing description - with or without HTML tags
  55    *
  56    * @return
  57    */
  58   public String getContent()
  59   {
  60
  61     return sb.toString();
  62   }
  63
  64   /**
  65    * list of Label|Link encoded URL links extracted from HTML
  66    *
  67    * @return
  68    */
  69   public ArrayList<String> getLinks()
  70   {
  71     return links;
  72   }
  73
  74   /**
  75    *
  76    * @param description
  77    *          - html or text content to be parsed
  78    * @param removeHTML
  79    *          flag to indicate if HTML tags should be removed if they are
  80    *          present.
  81    * @param newline
  82    */
  83   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  84           String newline)
  85   {
  86     if (description == null || description.length() == 0)
  87     {
  88       htmlContent = false;
  89       return;
  90     }
  91     if (description.toUpperCase().indexOf("<HTML>") == -1)
  92     {
  93       htmlContent = false;
  94     }
  95     orig = description;
  96     StringTokenizer st = new StringTokenizer(description, "<");
  97     String token, link;
  98     int startTag;
  99     String tag = null;
 100     while (st.hasMoreElements())
 101     {
 102       token = st.nextToken("&>");
 103       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 104       {
 105         continue;
 106       }
 107
 108       tag = null;
 109       startTag = token.indexOf("<");
 110
 111       if (startTag > -1)
 112       {
 113         tag = token.substring(startTag + 1);
 114         token = token.substring(0, startTag);
 115       }
 116
 117       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 118       {
 119         if (token.length() > 0)
 120         {
 121           sb.append(token);
 122         }
 123         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 124         String label = st.nextToken("<>");
 125         links.add(label + "|" + link);
 126         sb.append(label + "%LINK%");
 127       }
 128       else if (tag != null && tag.equalsIgnoreCase("br"))
 129       {
 130         sb.append(newline);
 131       }
 132       else if (token.startsWith("lt;"))
 133       {
 134         sb.append("<" + token.substring(3));
 135       }
 136       else if (token.startsWith("gt;"))
 137       {
 138         sb.append(">" + token.substring(3));
 139       }
 140       else if (token.startsWith("amp;"))
 141       {
 142         sb.append("&" + token.substring(4));
 143       }
 144       else
 145       {
 146         sb.append(token);
 147       }
 148     }
 149     if (removeHTML && !htmlContent)
 150     {
 151       // instead of parsing the html into plaintext
 152       // clean the description ready for embedding in html
 153       sb = new StringBuffer(Pattern.compile("<").matcher(description)
 154               .replaceAll("&lt;"));
 155
 156     }
 157
 158   }
 159
 160   /**
 161    * get either the parsed content or the original, depending on whether the
 162    * original looked like html content or not.
 163    *
 164    * @return
 165    */
 166   public String getNonHtmlContent()
 167   {
 168     return isHtmlContent() ? sb.toString() : orig;
 169   }
 170
 171 }