src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.StringTokenizer;
  25 import java.util.regex.Pattern;
  26
  27 /**
  28  * utility class for dealing with HTML link extraction
  29  *
  30  * @author jprocter
  31  *
  32  */
  33 public class ParseHtmlBodyAndLinks
  34 {
  35   private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern.compile("<");
  36
  37   String orig = null;
  38
  39   public String getOrig()
  40   {
  41     return orig;
  42   }
  43
  44   boolean htmlContent = true;
  45
  46   /**
  47    * @return true if the content looked like HTML
  48    */
  49   public boolean isHtmlContent()
  50   {
  51     return htmlContent;
  52   }
  53
  54   ArrayList<String> links = new ArrayList<String>();
  55
  56   StringBuffer sb = new StringBuffer();
  57
  58   /**
  59    * result of parsing description - with or without HTML tags
  60    *
  61    * @return
  62    */
  63   public String getContent()
  64   {
  65
  66     return sb.toString();
  67   }
  68
  69   /**
  70    * list of Label|Link encoded URL links extracted from HTML
  71    *
  72    * @return
  73    */
  74   public ArrayList<String> getLinks()
  75   {
  76     return links;
  77   }
  78
  79   /**
  80    *
  81    * @param description
  82    *          - html or text content to be parsed
  83    * @param removeHTML
  84    *          flag to indicate if HTML tags should be removed if they are
  85    *          present.
  86    * @param newline
  87    */
  88   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  89           String newline)
  90   {
  91     if (description == null || description.length() == 0)
  92     {
  93       htmlContent = false;
  94       return;
  95     }
  96     if (description.toUpperCase().indexOf("<HTML>") == -1)
  97     {
  98       htmlContent = false;
  99     }
 100     orig = description;
 101     StringTokenizer st = new StringTokenizer(description, "<");
 102     String token, link;
 103     int startTag;
 104     String tag = null;
 105     while (st.hasMoreElements())
 106     {
 107       token = st.nextToken("&>");
 108       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 109       {
 110         continue;
 111       }
 112
 113       tag = null;
 114       startTag = token.indexOf("<");
 115
 116       if (startTag > -1)
 117       {
 118         tag = token.substring(startTag + 1);
 119         token = token.substring(0, startTag);
 120       }
 121
 122       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 123       {
 124         if (token.length() > 0)
 125         {
 126           sb.append(token);
 127         }
 128         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 129         String label = st.nextToken("<>");
 130         links.add(label + "|" + link);
 131         sb.append(label + "%LINK%");
 132       }
 133       else if (tag != null && tag.equalsIgnoreCase("br"))
 134       {
 135         sb.append(newline);
 136       }
 137       else if (token.startsWith("lt;"))
 138       {
 139         sb.append("<" + token.substring(3));
 140       }
 141       else if (token.startsWith("gt;"))
 142       {
 143         sb.append(">" + token.substring(3));
 144       }
 145       else if (token.startsWith("amp;"))
 146       {
 147         sb.append("&" + token.substring(4));
 148       }
 149       else
 150       {
 151         sb.append(token);
 152       }
 153     }
 154     if (removeHTML && !htmlContent)
 155     {
 156       // instead of parsing the html into plaintext
 157       // clean the description ready for embedding in html
 158       sb = new StringBuffer(LEFT_ANGLE_BRACKET_PATTERN.matcher(description)
 159               .replaceAll("&lt;"));
 160
 161     }
 162
 163   }
 164
 165   /**
 166    * get either the parsed content or the original, depending on whether the
 167    * original looked like html content or not.
 168    *
 169    * @return
 170    */
 171   public String getNonHtmlContent()
 172   {
 173     return isHtmlContent() ? sb.toString() : orig;
 174   }
 175
 176 }