src/jalview/util/StringUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
  28
  29 public class StringUtils
  30 {
  31   private static final Pattern DELIMITERS_PATTERN = Pattern
  32           .compile(".*='[^']*(?!')");
  33
  34   private static final char PERCENT = '%';
  35
  36   private static final boolean DEBUG = false;
  37
  38   /*
  39    * URL encoded characters, indexed by char value
  40    * e.g. urlEncodings['='] = urlEncodings[61] = "%3D"
  41    */
  42   private static String[] urlEncodings = new String[255];
  43
  44   /**
  45    * Returns a new character array, after inserting characters into the given
  46    * character array.
  47    *
  48    * @param in
  49    *          the character array to insert into
  50    * @param position
  51    *          the 0-based position for insertion
  52    * @param count
  53    *          the number of characters to insert
  54    * @param ch
  55    *          the character to insert
  56    */
  57   public static final char[] insertCharAt(char[] in, int position,
  58           int count, char ch)
  59   {
  60     char[] tmp = new char[in.length + count];
  61
  62     if (position >= in.length)
  63     {
  64       System.arraycopy(in, 0, tmp, 0, in.length);
  65       position = in.length;
  66     }
  67     else
  68     {
  69       System.arraycopy(in, 0, tmp, 0, position);
  70     }
  71
  72     int index = position;
  73     while (count > 0)
  74     {
  75       tmp[index++] = ch;
  76       count--;
  77     }
  78
  79     if (position < in.length)
  80     {
  81       System.arraycopy(in, position, tmp, index, in.length - position);
  82     }
  83
  84     return tmp;
  85   }
  86
  87   /**
  88    * Delete
  89    *
  90    * @param in
  91    * @param from
  92    * @param to
  93    * @return
  94    */
  95   public static final char[] deleteChars(char[] in, int from, int to)
  96   {
  97     if (from >= in.length || from < 0)
  98     {
  99       return in;
 100     }
 101
 102     char[] tmp;
 103
 104     if (to >= in.length)
 105     {
 106       tmp = new char[from];
 107       System.arraycopy(in, 0, tmp, 0, from);
 108       to = in.length;
 109     }
 110     else
 111     {
 112       tmp = new char[in.length - to + from];
 113       System.arraycopy(in, 0, tmp, 0, from);
 114       System.arraycopy(in, to, tmp, from, in.length - to);
 115     }
 116     return tmp;
 117   }
 118
 119   /**
 120    * Returns the last part of 'input' after the last occurrence of 'token'. For
 121    * example to extract only the filename from a full path or URL.
 122    *
 123    * @param input
 124    * @param token
 125    *          a delimiter which must be in regular expression format
 126    * @return
 127    */
 128   public static String getLastToken(String input, String token)
 129   {
 130     if (input == null)
 131     {
 132       return null;
 133     }
 134     if (token == null)
 135     {
 136       return input;
 137     }
 138     String[] st = input.split(token);
 139     return st[st.length - 1];
 140   }
 141
 142   /**
 143    * Parses the input string into components separated by the delimiter. Unlike
 144    * String.split(), this method will ignore occurrences of the delimiter which
 145    * are nested within single quotes in name-value pair values, e.g. a='b,c'.
 146    *
 147    * @param input
 148    * @param delimiter
 149    * @return elements separated by separator
 150    */
 151   public static String[] separatorListToArray(String input,
 152           String delimiter)
 153   {
 154     int seplen = delimiter.length();
 155     if (input == null || input.equals("") || input.equals(delimiter))
 156     {
 157       return null;
 158     }
 159     List<String> jv = new ArrayList<>();
 160     int cp = 0, pos, escape;
 161     boolean wasescaped = false, wasquoted = false;
 162     String lstitem = null;
 163     while ((pos = input.indexOf(delimiter, cp)) >= cp)
 164     {
 165       escape = (pos > 0 && input.charAt(pos - 1) == '\\') ? -1 : 0;
 166       if (wasescaped || wasquoted)
 167       {
 168         // append to previous pos
 169         jv.set(jv.size() - 1, lstitem = lstitem + delimiter
 170                 + input.substring(cp, pos + escape));
 171       }
 172       else
 173       {
 174         jv.add(lstitem = input.substring(cp, pos + escape));
 175       }
 176       cp = pos + seplen;
 177       wasescaped = escape == -1;
 178       // last separator may be in an unmatched quote
 179       wasquoted = DELIMITERS_PATTERN.matcher(lstitem).matches();
 180     }
 181     if (cp < input.length())
 182     {
 183       String c = input.substring(cp);
 184       if (wasescaped || wasquoted)
 185       {
 186         // append final separator
 187         jv.set(jv.size() - 1, lstitem + delimiter + c);
 188       }
 189       else
 190       {
 191         if (!c.equals(delimiter))
 192         {
 193           jv.add(c);
 194         }
 195       }
 196     }
 197     if (jv.size() > 0)
 198     {
 199       String[] v = jv.toArray(new String[jv.size()]);
 200       jv.clear();
 201       if (DEBUG)
 202       {
 203         System.err.println("Array from '" + delimiter
 204                 + "' separated List:\n" + v.length);
 205         for (int i = 0; i < v.length; i++)
 206         {
 207           System.err.println("item " + i + " '" + v[i] + "'");
 208         }
 209       }
 210       return v;
 211     }
 212     if (DEBUG)
 213     {
 214       System.err.println(
 215               "Empty Array from '" + delimiter + "' separated List");
 216     }
 217     return null;
 218   }
 219
 220   /**
 221    * Returns a string which contains the list elements delimited by the
 222    * separator. Null items are ignored. If the input is null or has length zero,
 223    * a single delimiter is returned.
 224    *
 225    * @param list
 226    * @param separator
 227    * @return concatenated string
 228    */
 229   public static String arrayToSeparatorList(String[] list, String separator)
 230   {
 231     StringBuffer v = new StringBuffer();
 232     if (list != null && list.length > 0)
 233     {
 234       for (int i = 0, iSize = list.length; i < iSize; i++)
 235       {
 236         if (list[i] != null)
 237         {
 238           if (v.length() > 0)
 239           {
 240             v.append(separator);
 241           }
 242           // TODO - escape any separator values in list[i]
 243           v.append(list[i]);
 244         }
 245       }
 246       if (DEBUG)
 247       {
 248         System.err
 249                 .println("Returning '" + separator + "' separated List:\n");
 250         System.err.println(v);
 251       }
 252       return v.toString();
 253     }
 254     if (DEBUG)
 255     {
 256       System.err.println(
 257               "Returning empty '" + separator + "' separated List\n");
 258     }
 259     return "" + separator;
 260   }
 261
 262   /**
 263    * Converts a list to a string with a delimiter before each term except the
 264    * first. Returns an empty string given a null or zero-length argument. This
 265    * can be replaced with StringJoiner in Java 8.
 266    *
 267    * @param terms
 268    * @param delim
 269    * @return
 270    */
 271   public static String listToDelimitedString(List<String> terms,
 272           String delim)
 273   {
 274     StringBuilder sb = new StringBuilder(32);
 275     if (terms != null && !terms.isEmpty())
 276     {
 277       boolean appended = false;
 278       for (String term : terms)
 279       {
 280         if (appended)
 281         {
 282           sb.append(delim);
 283         }
 284         appended = true;
 285         sb.append(term);
 286       }
 287     }
 288     return sb.toString();
 289   }
 290
 291   /**
 292    * Convenience method to parse a string to an integer, returning 0 if the
 293    * input is null or not a valid integer
 294    *
 295    * @param s
 296    * @return
 297    */
 298   public static int parseInt(String s)
 299   {
 300     int result = 0;
 301     if (s != null && s.length() > 0)
 302     {
 303       try
 304       {
 305         result = Integer.parseInt(s);
 306       } catch (NumberFormatException ex)
 307       {
 308       }
 309     }
 310     return result;
 311   }
 312
 313   /**
 314    * Compares two versions formatted as e.g. "3.4.5" and returns -1, 0 or 1 as
 315    * the first version precedes, is equal to, or follows the second
 316    *
 317    * @param v1
 318    * @param v2
 319    * @return
 320    */
 321   public static int compareVersions(String v1, String v2)
 322   {
 323     return compareVersions(v1, v2, null);
 324   }
 325
 326   /**
 327    * Compares two versions formatted as e.g. "3.4.5b1" and returns -1, 0 or 1 as
 328    * the first version precedes, is equal to, or follows the second
 329    *
 330    * @param v1
 331    * @param v2
 332    * @param pointSeparator
 333    *          a string used to delimit point increments in sub-tokens of the
 334    *          version
 335    * @return
 336    */
 337   public static int compareVersions(String v1, String v2,
 338           String pointSeparator)
 339   {
 340     if (v1 == null || v2 == null)
 341     {
 342       return 0;
 343     }
 344     String[] toks1 = v1.split("\\.");
 345     String[] toks2 = v2.split("\\.");
 346     int i = 0;
 347     for (; i < toks1.length; i++)
 348     {
 349       if (i >= toks2.length)
 350       {
 351         /*
 352          * extra tokens in v1
 353          */
 354         return 1;
 355       }
 356       String tok1 = toks1[i];
 357       String tok2 = toks2[i];
 358       if (pointSeparator != null)
 359       {
 360         /*
 361          * convert e.g. 5b2 into decimal 5.2 for comparison purposes
 362          */
 363         tok1 = tok1.replace(pointSeparator, ".");
 364         tok2 = tok2.replace(pointSeparator, ".");
 365       }
 366       try
 367       {
 368         float f1 = Float.valueOf(tok1);
 369         float f2 = Float.valueOf(tok2);
 370         int comp = Float.compare(f1, f2);
 371         if (comp != 0)
 372         {
 373           return comp;
 374         }
 375       } catch (NumberFormatException e)
 376       {
 377         System.err
 378                 .println("Invalid version format found: " + e.getMessage());
 379         return 0;
 380       }
 381     }
 382
 383     if (i < toks2.length)
 384     {
 385       /*
 386        * extra tokens in v2
 387        */
 388       return -1;
 389     }
 390
 391     /*
 392      * same length, all tokens match
 393      */
 394     return 0;
 395   }
 396
 397   /**
 398    * Converts the string to all lower-case except the first character which is
 399    * upper-cased
 400    *
 401    * @param s
 402    * @return
 403    */
 404   public static String toSentenceCase(String s)
 405   {
 406     if (s == null)
 407     {
 408       return s;
 409     }
 410     if (s.length() <= 1)
 411     {
 412       return s.toUpperCase();
 413     }
 414     return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
 415   }
 416
 417   /**
 418    * A helper method that strips off any leading or trailing html and body tags.
 419    * If no html tag is found, then also html-encodes angle bracket characters.
 420    *
 421    * @param text
 422    * @return
 423    */
 424   public static String stripHtmlTags(String text)
 425   {
 426     if (text == null)
 427     {
 428       return null;
 429     }
 430     String tmp2up = text.toUpperCase();
 431     int startTag = tmp2up.indexOf("<HTML>");
 432     if (startTag > -1)
 433     {
 434       text = text.substring(startTag + 6);
 435       tmp2up = tmp2up.substring(startTag + 6);
 436     }
 437     // is omission of "<BODY>" intentional here??
 438     int endTag = tmp2up.indexOf("</BODY>");
 439     if (endTag > -1)
 440     {
 441       text = text.substring(0, endTag);
 442       tmp2up = tmp2up.substring(0, endTag);
 443     }
 444     endTag = tmp2up.indexOf("</HTML>");
 445     if (endTag > -1)
 446     {
 447       text = text.substring(0, endTag);
 448     }
 449
 450     if (startTag == -1 && (text.contains("<") || text.contains(">")))
 451     {
 452       text = text.replaceAll("<", "&lt;");
 453       text = text.replaceAll(">", "&gt;");
 454     }
 455     return text;
 456   }
 457
 458   /**
 459    * Answers the input string with any occurrences of the 'encodeable' characters
 460    * replaced by their URL encoding
 461    *
 462    * @param s
 463    * @param encodable
 464    * @return
 465    */
 466   public static String urlEncode(String s, String encodable)
 467   {
 468     if (s == null || s.isEmpty())
 469     {
 470       return s;
 471     }
 472
 473     /*
 474      * do % encoding first, as otherwise it may double-encode!
 475      */
 476     if (encodable.indexOf(PERCENT) != -1)
 477     {
 478       s = urlEncode(s, PERCENT);
 479     }
 480
 481     for (char c : encodable.toCharArray())
 482     {
 483       if (c != PERCENT)
 484       {
 485         s = urlEncode(s, c);
 486       }
 487     }
 488     return s;
 489   }
 490
 491   /**
 492    * Answers the input string with any occurrences of {@code c} replaced with
 493    * their url encoding. Answers the input string if it is unchanged.
 494    *
 495    * @param s
 496    * @param c
 497    * @return
 498    */
 499   static String urlEncode(String s, char c)
 500   {
 501     String decoded = String.valueOf(c);
 502     if (s.indexOf(decoded) != -1)
 503     {
 504       String encoded = getUrlEncoding(c);
 505       if (!encoded.equals(decoded))
 506       {
 507         s = s.replace(decoded, encoded);
 508       }
 509     }
 510     return s;
 511   }
 512
 513   /**
 514    * Answers the input string with any occurrences of the specified (unencoded)
 515    * characters replaced by their URL decoding.
 516    * <p>
 517    * Example: {@code urlDecode("a%3Db%3Bc", "-;=,")} should answer
 518    * {@code "a=b;c"}.
 519    *
 520    * @param s
 521    * @param encodable
 522    * @return
 523    */
 524   public static String urlDecode(String s, String encodable)
 525   {
 526     if (s == null || s.isEmpty())
 527     {
 528       return s;
 529     }
 530
 531     for (char c : encodable.toCharArray())
 532     {
 533       String encoded = getUrlEncoding(c);
 534       if (s.indexOf(encoded) != -1)
 535       {
 536         String decoded = String.valueOf(c);
 537         s = s.replace(encoded, decoded);
 538       }
 539     }
 540     return s;
 541   }
 542
 543   /**
 544    * Does a lazy lookup of the url encoding of the given character, saving the
 545    * value for repeat lookups
 546    *
 547    * @param c
 548    * @return
 549    */
 550   private static String getUrlEncoding(char c)
 551   {
 552     if (c < 0 || c >= urlEncodings.length)
 553     {
 554       return String.valueOf(c);
 555     }
 556
 557     String enc = urlEncodings[c];
 558     if (enc == null)
 559     {
 560       try
 561       {
 562         enc = urlEncodings[c] = URLEncoder.encode(String.valueOf(c),
 563                 "UTF-8");
 564       } catch (UnsupportedEncodingException e)
 565       {
 566         enc = urlEncodings[c] = String.valueOf(c);
 567       }
 568     }
 569     return enc;
 570   }
 571 }