src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.Locale;
  24
  25 import java.util.ArrayList;
  26 import java.util.List;
  27 import java.util.StringTokenizer;
  28 import java.util.regex.Pattern;
  29
  30 /**
  31  * utility class for dealing with HTML link extraction
  32  *
  33  * @author jprocter
  34  *
  35  */
  36 public class ParseHtmlBodyAndLinks
  37 {
  38   private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern
  39           .compile("<");
  40
  41   String orig = null;
  42
  43   public String getOrig()
  44   {
  45     return orig;
  46   }
  47
  48   boolean htmlContent = true;
  49
  50   /**
  51    * @return true if the content looked like HTML
  52    */
  53   public boolean isHtmlContent()
  54   {
  55     return htmlContent;
  56   }
  57
  58   List<String> links = new ArrayList<String>();
  59
  60   String content;
  61
  62   /**
  63    * result of parsing description - with or without HTML tags
  64    *
  65    * @return
  66    */
  67   public String getContent()
  68   {
  69
  70     return content;
  71   }
  72
  73   /**
  74    * list of Label|Link encoded URL links extracted from HTML
  75    *
  76    * @return
  77    */
  78   public List<String> getLinks()
  79   {
  80     return links;
  81   }
  82
  83   /**
  84    * Parses the given html and
  85    * <ul>
  86    * <li>extracts any 'href' links to a list of "displayName|url" strings,
  87    * retrievable by #getLinks</li>
  88    * <li>extracts the remaining text (with %LINK% placeholders replacing hrefs),
  89    * retrievable by #getContent</li>
  90    * </ul>
  91    *
  92    * @param description
  93    *          - html or text content to be parsed
  94    * @param removeHTML
  95    *          flag to indicate if HTML tags should be removed if they are
  96    *          present.
  97    * @param newline
  98    */
  99   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
 100           String newline)
 101   {
 102     if (description == null || description.length() == 0)
 103     {
 104       htmlContent = false;
 105       return;
 106     }
 107     StringBuilder sb = new StringBuilder(description.length());
 108     if (description.toUpperCase(Locale.ROOT).indexOf("<HTML>") == -1)
 109     {
 110       htmlContent = false;
 111     }
 112     orig = description;
 113     StringTokenizer st = new StringTokenizer(description, "<");
 114     String token, link;
 115     int startTag;
 116     String tag = null;
 117     while (st.hasMoreElements())
 118     {
 119       token = st.nextToken(">");
 120       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 121       {
 122         continue;
 123       }
 124
 125       tag = null;
 126       startTag = token.indexOf("<");
 127
 128       if (startTag > -1)
 129       {
 130         tag = token.substring(startTag + 1);
 131         token = token.substring(0, startTag);
 132       }
 133
 134       if (tag != null && tag.toUpperCase(Locale.ROOT).startsWith("A HREF="))
 135       {
 136         if (token.length() > 0)
 137         {
 138           sb.append(token);
 139         }
 140         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 141         String label = st.nextToken("<>");
 142         links.add(label + "|" + link);
 143         sb.append(label + "%LINK%");
 144       }
 145       else if (tag != null && tag.equalsIgnoreCase("br"))
 146       {
 147         sb.append(newline);
 148       }
 149       else
 150       {
 151         sb.append(token);
 152       }
 153     }
 154     if (removeHTML && !htmlContent)
 155     {
 156       // instead of parsing the html into plaintext
 157       // clean the description ready for embedding in html
 158       sb = new StringBuilder(LEFT_ANGLE_BRACKET_PATTERN.matcher(description)
 159               .replaceAll("&lt;"));
 160     }
 161     content = translateEntities(sb.toString());
 162   }
 163
 164   private String translateEntities(String s)
 165   {
 166     s = s.replaceAll("&amp;", "&");
 167     s = s.replaceAll("&lt;", "<");
 168     s = s.replaceAll("&gt;", ">");
 169     return s;
 170   }
 171
 172   /**
 173    * get either the parsed content or the original, depending on whether the
 174    * original looked like html content or not.
 175    *
 176    * @return
 177    */
 178   public String getNonHtmlContent()
 179   {
 180     return isHtmlContent() ? content : orig;
 181   }
 182
 183 }