src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.StringTokenizer;
  25 import java.util.regex.Pattern;
  26
  27 /**
  28  * utility class for dealing with HTML link extraction
  29  *
  30  * @author jprocter
  31  *
  32  */
  33 public class ParseHtmlBodyAndLinks
  34 {
  35   private static final Pattern LEFT_ANGLE_BRACKET_PATTERN = Pattern
  36           .compile("<");
  37
  38   String orig = null;
  39
  40   public String getOrig()
  41   {
  42     return orig;
  43   }
  44
  45   boolean htmlContent = true;
  46
  47   /**
  48    * @return true if the content looked like HTML
  49    */
  50   public boolean isHtmlContent()
  51   {
  52     return htmlContent;
  53   }
  54
  55   ArrayList<String> links = new ArrayList<String>();
  56
  57   StringBuffer sb = new StringBuffer();
  58
  59   /**
  60    * result of parsing description - with or without HTML tags
  61    *
  62    * @return
  63    */
  64   public String getContent()
  65   {
  66
  67     return sb.toString();
  68   }
  69
  70   /**
  71    * list of Label|Link encoded URL links extracted from HTML
  72    *
  73    * @return
  74    */
  75   public ArrayList<String> getLinks()
  76   {
  77     return links;
  78   }
  79
  80   /**
  81    *
  82    * @param description
  83    *          - html or text content to be parsed
  84    * @param removeHTML
  85    *          flag to indicate if HTML tags should be removed if they are
  86    *          present.
  87    * @param newline
  88    */
  89   public ParseHtmlBodyAndLinks(String description, boolean removeHTML,
  90           String newline)
  91   {
  92     if (description == null || description.length() == 0)
  93     {
  94       htmlContent = false;
  95       return;
  96     }
  97     if (description.toUpperCase().indexOf("<HTML>") == -1)
  98     {
  99       htmlContent = false;
 100     }
 101     orig = description;
 102     StringTokenizer st = new StringTokenizer(description, "<");
 103     String token, link;
 104     int startTag;
 105     String tag = null;
 106     while (st.hasMoreElements())
 107     {
 108       token = st.nextToken("&>");
 109       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
 110       {
 111         continue;
 112       }
 113
 114       tag = null;
 115       startTag = token.indexOf("<");
 116
 117       if (startTag > -1)
 118       {
 119         tag = token.substring(startTag + 1);
 120         token = token.substring(0, startTag);
 121       }
 122
 123       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 124       {
 125         if (token.length() > 0)
 126         {
 127           sb.append(token);
 128         }
 129         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 130         String label = st.nextToken("<>");
 131         links.add(label + "|" + link);
 132         sb.append(label + "%LINK%");
 133       }
 134       else if (tag != null && tag.equalsIgnoreCase("br"))
 135       {
 136         sb.append(newline);
 137       }
 138       else if (token.startsWith("lt;"))
 139       {
 140         sb.append("<" + token.substring(3));
 141       }
 142       else if (token.startsWith("gt;"))
 143       {
 144         sb.append(">" + token.substring(3));
 145       }
 146       else if (token.startsWith("amp;"))
 147       {
 148         sb.append("&" + token.substring(4));
 149       }
 150       else
 151       {
 152         sb.append(token);
 153       }
 154     }
 155     if (removeHTML && !htmlContent)
 156     {
 157       // instead of parsing the html into plaintext
 158       // clean the description ready for embedding in html
 159       sb = new StringBuffer(LEFT_ANGLE_BRACKET_PATTERN.matcher(description)
 160               .replaceAll("&lt;"));
 161
 162     }
 163
 164   }
 165
 166   /**
 167    * get either the parsed content or the original, depending on whether the
 168    * original looked like html content or not.
 169    *
 170    * @return
 171    */
 172   public String getNonHtmlContent()
 173   {
 174     return isHtmlContent() ? sb.toString() : orig;
 175   }
 176
 177 }