src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.0b1)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  * The Jalview Authors are detailed in the 'AUTHORS' file.
  18  */
  19 package jalview.util;
  20
  21 import java.util.ArrayList;
  22 import java.util.StringTokenizer;
  23 import java.util.regex.Pattern;
  24
  25 /**
  26  * utility class for dealing with HTML link extraction
  27  *
  28  * @author jprocter
  29  *
  30  */
  31 public class ParseHtmlBodyAndLinks
  32 {
  33   String orig = null;
  34
  35   public String getOrig()
  36   {
  37     return orig;
  38   }
  39
  40   boolean htmlContent = true;
  41
  42   /**
  43    * @return true if the content looked like HTML
  44    */
  45   public boolean isHtmlContent()
  46   {
  47     return htmlContent;
  48   }
  49
  50   ArrayList<String> links = new ArrayList<String>();
  51
  52   StringBuffer sb = new StringBuffer();
  53
  54   /**
  55    * result of parsing description - with or without HTML tags
  56    *
  57    * @return
  58    */
  59   public String getContent()
  60   {
  61
  62     return sb.toString();
  63   }
  64
  65   /**
  66    * list of Label|Link encoded URL links extracted from HTML
  67    *
  68    * @return
  69    */
  70   public ArrayList<String> getLinks()
  71   {
  72     return links;
  73   }
  74
  75   /**
  76    *
  77    * @param description
  78    *          - html or text content to be parsed
  79    * @param removeHTML
  80    *          flag to indicate if HTML tags should be removed if they are
  81    *          present.
  82    * @param newline
  83    */
  84   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  85           String newline)
  86   {
  87     if (description == null || description.length() == 0)
  88     {
  89       htmlContent = false;
  90       return;
  91     }
  92     if (description.toUpperCase().indexOf("<HTML>") == -1)
  93     {
  94       htmlContent = false;
  95     }
  96     orig = description;
  97     StringTokenizer st = new StringTokenizer(description, "<");
  98     String token, link;
  99     int startTag;
 100     String tag = null;
 101     while (st.hasMoreElements())
 102     {
 103       token = st.nextToken("&>");
 104       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 105       {
 106         continue;
 107       }
 108
 109       tag = null;
 110       startTag = token.indexOf("<");
 111
 112       if (startTag > -1)
 113       {
 114         tag = token.substring(startTag + 1);
 115         token = token.substring(0, startTag);
 116       }
 117
 118       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 119       {
 120         if (token.length() > 0)
 121         {
 122           sb.append(token);
 123         }
 124         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 125         String label = st.nextToken("<>");
 126         links.add(label + "|" + link);
 127         sb.append(label + "%LINK%");
 128       }
 129       else if (tag != null && tag.equalsIgnoreCase("br"))
 130       {
 131         sb.append(newline);
 132       }
 133       else if (token.startsWith("lt;"))
 134       {
 135         sb.append("<" + token.substring(3));
 136       }
 137       else if (token.startsWith("gt;"))
 138       {
 139         sb.append(">" + token.substring(3));
 140       }
 141       else if (token.startsWith("amp;"))
 142       {
 143         sb.append("&" + token.substring(4));
 144       }
 145       else
 146       {
 147         sb.append(token);
 148       }
 149     }
 150     if (removeHTML && !htmlContent)
 151     {
 152       // instead of parsing the html into plaintext
 153       // clean the description ready for embedding in html
 154       sb = new StringBuffer(Pattern.compile("<").matcher(description)
 155               .replaceAll("&lt;"));
 156
 157     }
 158
 159   }
 160
 161   /**
 162    * get either the parsed content or the original, depending on whether the
 163    * original looked like html content or not.
 164    *
 165    * @return
 166    */
 167   public String getNonHtmlContent()
 168   {
 169     return isHtmlContent() ? sb.toString() : orig;
 170   }
 171
 172 }