src/jalview/util/ParseHtmlBodyAndLinks.java

   1 /*******************************************************************************
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
   3  * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  10  *
  11  * Jalview is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  14  * PURPOSE.  See the GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  17  *******************************************************************************/
  18 package jalview.util;
  19
  20 import java.util.ArrayList;
  21 import java.util.StringTokenizer;
  22 import java.util.regex.Pattern;
  23
  24
  25 /**
  26  * utility class for dealing with HTML link extraction
  27  * @author jprocter
  28  *
  29  */
  30 public class ParseHtmlBodyAndLinks
  31   {
  32     String orig=null;
  33     public String getOrig()
  34     {
  35       return orig;
  36     }
  37     boolean htmlContent=true;
  38     /**
  39      * @return true if the content looked like HTML
  40
  41      */
  42     public boolean isHtmlContent()
  43     {
  44       return htmlContent;
  45     }
  46
  47     ArrayList<String> links=new ArrayList<String>();
  48     StringBuffer sb = new StringBuffer();
  49     /**
  50      * result of parsing description - with or without HTML tags
  51      * @return
  52      */
  53     public String getContent()
  54     {
  55
  56       return sb.toString();
  57     }
  58     /**
  59      * list of Label|Link encoded URL links extracted from HTML
  60      * @return
  61      */
  62     public ArrayList<String> getLinks() {
  63       return links;
  64     }
  65
  66     /**
  67      *
  68      * @param description - html or text content to be parsed
  69      * @param removeHTML flag to indicate if HTML tags should be removed if they are present.
  70      * @param newline
  71      */
  72     public ParseHtmlBodyAndLinks(String description,
  73           boolean removeHTML, String newline)
  74     {
  75       if (description==null || description.length()==0)
  76       {
  77         htmlContent=false;
  78         return;
  79       }
  80     if (description.toUpperCase().indexOf("<HTML>") == -1)
  81     {
  82       htmlContent = false;
  83     }
  84     orig = description;
  85     StringTokenizer st = new StringTokenizer(description, "<");
  86     String token, link;
  87     int startTag;
  88     String tag = null;
  89     while (st.hasMoreElements())
  90     {
  91       token = st.nextToken("&>");
  92       if (token.equalsIgnoreCase("html") || token.startsWith("/"))
  93       {
  94         continue;
  95       }
  96
  97       tag = null;
  98       startTag = token.indexOf("<");
  99
 100       if (startTag > -1)
 101       {
 102         tag = token.substring(startTag + 1);
 103         token = token.substring(0, startTag);
 104       }
 105
 106       if (tag != null && tag.toUpperCase().startsWith("A HREF="))
 107       {
 108         if (token.length() > 0)
 109         {
 110           sb.append(token);
 111         }
 112         link = tag.substring(tag.indexOf("\"") + 1, tag.length() - 1);
 113         String label = st.nextToken("<>");
 114         links.add(label + "|" + link);
 115         sb.append(label + "%LINK%");
 116       }
 117       else if (tag != null && tag.equalsIgnoreCase("br"))
 118       {
 119         sb.append(newline);
 120       }
 121       else if (token.startsWith("lt;"))
 122       {
 123         sb.append("<" + token.substring(3));
 124       }
 125       else if (token.startsWith("gt;"))
 126       {
 127         sb.append(">" + token.substring(3));
 128       }
 129       else if (token.startsWith("amp;"))
 130       {
 131         sb.append("&" + token.substring(4));
 132       }
 133       else
 134       {
 135         sb.append(token);
 136       }
 137     }
 138     if (removeHTML && !htmlContent)
 139     {
 140       // instead of parsing the html into plaintext
 141       // clean the description ready for embedding in html
 142       sb = new StringBuffer(Pattern.compile("<").matcher(description).replaceAll("&lt;"));
 143
 144     }
 145
 146   }
 147     /**
 148      * get either the parsed content or the original, depending on whether the original looked like html content or not.
 149      * @return
 150      */
 151     public String getNonHtmlContent()
 152     {
 153       return isHtmlContent() ? sb.toString() : orig;
 154     }
 155
 156 }