src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.StringTokenizer;
  25 import java.util.regex.Pattern;
  26
  27 /**
  28  * utility class for dealing with HTML link extraction
  29  *
  30  * @author jprocter
  31  *
  32  */
  33 public class ParseHtmlBodyAndLinks
  34 {
  35   String orig = null;
  36
  37   public String getOrig()
  38   {
  39     return orig;
  40   }
  41
  42   boolean htmlContent = true;
  43
  44   /**
  45    * @return true if the content looked like HTML
  46    */
  47   public boolean isHtmlContent()
  48   {
  49     return htmlContent;
  50   }
  51
  52   ArrayList<String> links = new ArrayList<String>();
  53
  54   StringBuffer sb = new StringBuffer();
  55
  56   /**
  57    * result of parsing description - with or without HTML tags
  58    *
  59    * @return
  60    */
  61   public String getContent()
  62   {
  63
  64     return sb.toString();
  65   }
  66
  67   /**
  68    * list of Label|Link encoded URL links extracted from HTML
  69    *
  70    * @return
  71    */
  72   public ArrayList<String> getLinks()
  73   {
  74     return links;
  75   }
  76
  77   /**
  78    *
  79    * @param description
  80    *          - html or text content to be parsed
  81    * @param removeHTML
  82    *          flag to indicate if HTML tags should be removed if they are
  83    *          present.
  84    * @param newline
  85    */
  86   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  87           String newline)
  88   {
  89     if (description == null || description.length() == 0)
  90     {
  91       htmlContent = false;
  92       return;
  93     }
  94     if (description.toUpperCase().indexOf("<HTML>") == -1)
  95     {
  96       htmlContent = false;
  97     }
  98     orig = description;
  99     StringTokenizer st = new StringTokenizer(description, "<");
 100     String token, link;
 101     int startTag;
 102     String tag = null;
 103     while (st.hasMoreElements())
 104     {
 105       token = st.nextToken("&>");
 106       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 107       {
 108         continue;
 109       }
 110
 111       tag = null;
 112       startTag = token.indexOf("<");
 113
 114       if (startTag > -1)
 115       {
 116         tag = token.substring(startTag + 1);
 117         token = token.substring(0, startTag);
 118       }
 119
 120       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 121       {
 122         if (token.length() > 0)
 123         {
 124           sb.append(token);
 125         }
 126         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 127         String label = st.nextToken("<>");
 128         links.add(label + "|" + link);
 129         sb.append(label + "%LINK%");
 130       }
 131       else if (tag != null && tag.equalsIgnoreCase("br"))
 132       {
 133         sb.append(newline);
 134       }
 135       else if (token.startsWith("lt;"))
 136       {
 137         sb.append("<" + token.substring(3));
 138       }
 139       else if (token.startsWith("gt;"))
 140       {
 141         sb.append(">" + token.substring(3));
 142       }
 143       else if (token.startsWith("amp;"))
 144       {
 145         sb.append("&" + token.substring(4));
 146       }
 147       else
 148       {
 149         sb.append(token);
 150       }
 151     }
 152     if (removeHTML && !htmlContent)
 153     {
 154       // instead of parsing the html into plaintext
 155       // clean the description ready for embedding in html
 156       sb = new StringBuffer(Pattern.compile("<").matcher(description)
 157               .replaceAll("&lt;"));
 158
 159     }
 160
 161   }
 162
 163   /**
 164    * get either the parsed content or the original, depending on whether the
 165    * original looked like html content or not.
 166    *
 167    * @return
 168    */
 169   public String getNonHtmlContent()
 170   {
 171     return isHtmlContent() ? sb.toString() : orig;
 172   }
 173
 174 }