ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.List;
  25 import java.util.StringTokenizer;
  26 import java.util.regex.Pattern;
  27
  28 /**
  29  * utility class for dealing with HTML link extraction
  30  *
  31  * @author jprocter
  32  *
  33  */
  34 public class ParseHtmlBodyAndLinks
  35 {
  36   private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern
  37           .compile("<");
  38
  39   String orig = null;
  40
  41   public String getOrig()
  42   {
  43     return orig;
  44   }
  45
  46   boolean htmlContent = true;
  47
  48   /**
  49    * @return true if the content looked like HTML
  50    */
  51   public boolean isHtmlContent()
  52   {
  53     return htmlContent;
  54   }
  55
  56   List<String> links = new ArrayList<String>();
  57
  58   String content;
  59
  60   /**
  61    * result of parsing description - with or without HTML tags
  62    *
  63    * @return
  64    */
  65   public String getContent()
  66   {
  67
  68     return content;
  69   }
  70
  71   /**
  72    * list of Label|Link encoded URL links extracted from HTML
  73    *
  74    * @return
  75    */
  76   public List<String> getLinks()
  77   {
  78     return links;
  79   }
  80
  81   /**
  82    * Parses the given html and
  83    * <ul>
  84    * <li>extracts any 'href' links to a list of "displayName|url" strings,
  85    * retrievable by #getLinks</li>
  86    * <li>extracts the remaining text (with %LINK% placeholders replacing hrefs),
  87    * retrievable by #getContent</li>
  88    * </ul>
  89    *
  90    * @param description
  91    *          - html or text content to be parsed
  92    * @param removeHTML
  93    *          flag to indicate if HTML tags should be removed if they are
  94    *          present.
  95    * @param newline
  96    */
  97   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  98           String newline)
  99   {
 100     StringBuilder sb = new StringBuilder(description.length());
 101     if (description == null || description.length() == 0)
 102     {
 103       htmlContent = false;
 104       return;
 105     }
 106     if (description.toUpperCase().indexOf("<HTML>") == -1)
 107     {
 108       htmlContent = false;
 109     }
 110     orig = description;
 111     StringTokenizer st = new StringTokenizer(description, "<");
 112     String token, link;
 113     int startTag;
 114     String tag = null;
 115     while (st.hasMoreElements())
 116     {
 117       token = st.nextToken(">");
 118       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 119       {
 120         continue;
 121       }
 122
 123       tag = null;
 124       startTag = token.indexOf("<");
 125
 126       if (startTag > -1)
 127       {
 128         tag = token.substring(startTag + 1);
 129         token = token.substring(0, startTag);
 130       }
 131
 132       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 133       {
 134         if (token.length() > 0)
 135         {
 136           sb.append(token);
 137         }
 138         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 139         String label = st.nextToken("<>");
 140         links.add(label + "|" + link);
 141         sb.append(label + "%LINK%");
 142       }
 143       else if (tag != null && tag.equalsIgnoreCase("br"))
 144       {
 145         sb.append(newline);
 146       }
 147       else
 148       {
 149         sb.append(token);
 150       }
 151     }
 152     if (removeHTML && !htmlContent)
 153     {
 154       // instead of parsing the html into plaintext
 155       // clean the description ready for embedding in html
 156       sb = new StringBuilder(LEFT_ANGLE_BRACKET_PATTERN
 157               .matcher(description).replaceAll("&lt;"));
 158     }
 159     content = translateEntities(sb.toString());
 160   }
 161
 162   private String translateEntities(String s)
 163   {
 164     s = s.replaceAll("&amp;", "&");
 165     s = s.replaceAll("&lt;", "<");
 166     s = s.replaceAll("&gt;", ">");
 167     return s;
 168   }
 169
 170   /**
 171    * get either the parsed content or the original, depending on whether the
 172    * original looked like html content or not.
 173    *
 174    * @return
 175    */
 176   public String getNonHtmlContent()
 177   {
 178     return isHtmlContent() ? content : orig;
 179   }
 180
 181 }