src/jalview/util/StringUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.io.UnsupportedEncodingException;
  24 import java.net.URLEncoder;
  25 import java.util.ArrayList;
  26 import java.util.List;
  27 import java.util.Locale;
  28 import java.util.regex.Matcher;
  29 import java.util.regex.Pattern;
  30
  31 public class StringUtils
  32 {
  33   private static final Pattern DELIMITERS_PATTERN = Pattern
  34           .compile(".*='[^']*(?!')");
  35
  36   private static final char PERCENT = '%';
  37
  38   private static final boolean DEBUG = false;
  39
  40   /*
  41    * URL encoded characters, indexed by char value
  42    * e.g. urlEncodings['='] = urlEncodings[61] = "%3D"
  43    */
  44   private static String[] urlEncodings = new String[255];
  45
  46   /**
  47    * Returns a new character array, after inserting characters into the given
  48    * character array.
  49    *
  50    * @param in
  51    *          the character array to insert into
  52    * @param position
  53    *          the 0-based position for insertion
  54    * @param count
  55    *          the number of characters to insert
  56    * @param ch
  57    *          the character to insert
  58    */
  59   public static final char[] insertCharAt(char[] in, int position,
  60           int count, char ch)
  61   {
  62     char[] tmp = new char[in.length + count];
  63
  64     if (position >= in.length)
  65     {
  66       System.arraycopy(in, 0, tmp, 0, in.length);
  67       position = in.length;
  68     }
  69     else
  70     {
  71       System.arraycopy(in, 0, tmp, 0, position);
  72     }
  73
  74     int index = position;
  75     while (count > 0)
  76     {
  77       tmp[index++] = ch;
  78       count--;
  79     }
  80
  81     if (position < in.length)
  82     {
  83       System.arraycopy(in, position, tmp, index, in.length - position);
  84     }
  85
  86     return tmp;
  87   }
  88
  89   /**
  90    * Delete
  91    *
  92    * @param in
  93    * @param from
  94    * @param to
  95    * @return
  96    */
  97   public static final char[] deleteChars(char[] in, int from, int to)
  98   {
  99     if (from >= in.length || from < 0)
 100     {
 101       return in;
 102     }
 103
 104     char[] tmp;
 105
 106     if (to >= in.length)
 107     {
 108       tmp = new char[from];
 109       System.arraycopy(in, 0, tmp, 0, from);
 110       to = in.length;
 111     }
 112     else
 113     {
 114       tmp = new char[in.length - to + from];
 115       System.arraycopy(in, 0, tmp, 0, from);
 116       System.arraycopy(in, to, tmp, from, in.length - to);
 117     }
 118     return tmp;
 119   }
 120
 121   /**
 122    * Returns the last part of 'input' after the last occurrence of 'token'. For
 123    * example to extract only the filename from a full path or URL.
 124    *
 125    * @param input
 126    * @param token
 127    *          a delimiter which must be in regular expression format
 128    * @return
 129    */
 130   public static String getLastToken(String input, String token)
 131   {
 132     if (input == null)
 133     {
 134       return null;
 135     }
 136     if (token == null)
 137     {
 138       return input;
 139     }
 140     String[] st = input.split(token);
 141     return st[st.length - 1];
 142   }
 143
 144   /**
 145    * Parses the input string into components separated by the delimiter. Unlike
 146    * String.split(), this method will ignore occurrences of the delimiter which
 147    * are nested within single quotes in name-value pair values, e.g. a='b,c'.
 148    *
 149    * @param input
 150    * @param delimiter
 151    * @return elements separated by separator
 152    */
 153   public static String[] separatorListToArray(String input,
 154           String delimiter)
 155   {
 156     int seplen = delimiter.length();
 157     if (input == null || input.equals("") || input.equals(delimiter))
 158     {
 159       return null;
 160     }
 161     List<String> jv = new ArrayList<>();
 162     int cp = 0, pos, escape;
 163     boolean wasescaped = false, wasquoted = false;
 164     String lstitem = null;
 165     while ((pos = input.indexOf(delimiter, cp)) >= cp)
 166     {
 167       escape = (pos > 0 && input.charAt(pos - 1) == '\\') ? -1 : 0;
 168       if (wasescaped || wasquoted)
 169       {
 170         // append to previous pos
 171         jv.set(jv.size() - 1, lstitem = lstitem + delimiter
 172                 + input.substring(cp, pos + escape));
 173       }
 174       else
 175       {
 176         jv.add(lstitem = input.substring(cp, pos + escape));
 177       }
 178       cp = pos + seplen;
 179       wasescaped = escape == -1;
 180       // last separator may be in an unmatched quote
 181       wasquoted = DELIMITERS_PATTERN.matcher(lstitem).matches();
 182     }
 183     if (cp < input.length())
 184     {
 185       String c = input.substring(cp);
 186       if (wasescaped || wasquoted)
 187       {
 188         // append final separator
 189         jv.set(jv.size() - 1, lstitem + delimiter + c);
 190       }
 191       else
 192       {
 193         if (!c.equals(delimiter))
 194         {
 195           jv.add(c);
 196         }
 197       }
 198     }
 199     if (jv.size() > 0)
 200     {
 201       String[] v = jv.toArray(new String[jv.size()]);
 202       jv.clear();
 203       if (DEBUG)
 204       {
 205         System.err.println("Array from '" + delimiter
 206                 + "' separated List:\n" + v.length);
 207         for (int i = 0; i < v.length; i++)
 208         {
 209           System.err.println("item " + i + " '" + v[i] + "'");
 210         }
 211       }
 212       return v;
 213     }
 214     if (DEBUG)
 215     {
 216       System.err.println(
 217               "Empty Array from '" + delimiter + "' separated List");
 218     }
 219     return null;
 220   }
 221
 222   /**
 223    * Returns a string which contains the list elements delimited by the
 224    * separator. Null items are ignored. If the input is null or has length zero,
 225    * a single delimiter is returned.
 226    *
 227    * @param list
 228    * @param separator
 229    * @return concatenated string
 230    */
 231   public static String arrayToSeparatorList(String[] list, String separator)
 232   {
 233     StringBuffer v = new StringBuffer();
 234     if (list != null && list.length > 0)
 235     {
 236       for (int i = 0, iSize = list.length; i < iSize; i++)
 237       {
 238         if (list[i] != null)
 239         {
 240           if (v.length() > 0)
 241           {
 242             v.append(separator);
 243           }
 244           // TODO - escape any separator values in list[i]
 245           v.append(list[i]);
 246         }
 247       }
 248       if (DEBUG)
 249       {
 250         System.err
 251                 .println("Returning '" + separator + "' separated List:\n");
 252         System.err.println(v);
 253       }
 254       return v.toString();
 255     }
 256     if (DEBUG)
 257     {
 258       System.err.println(
 259               "Returning empty '" + separator + "' separated List\n");
 260     }
 261     return "" + separator;
 262   }
 263
 264   /**
 265    * Converts a list to a string with a delimiter before each term except the
 266    * first. Returns an empty string given a null or zero-length argument. This
 267    * can be replaced with StringJoiner in Java 8.
 268    *
 269    * @param terms
 270    * @param delim
 271    * @return
 272    */
 273   public static String listToDelimitedString(List<String> terms,
 274           String delim)
 275   {
 276     StringBuilder sb = new StringBuilder(32);
 277     if (terms != null && !terms.isEmpty())
 278     {
 279       boolean appended = false;
 280       for (String term : terms)
 281       {
 282         if (appended)
 283         {
 284           sb.append(delim);
 285         }
 286         appended = true;
 287         sb.append(term);
 288       }
 289     }
 290     return sb.toString();
 291   }
 292
 293   /**
 294    * Convenience method to parse a string to an integer, returning 0 if the
 295    * input is null or not a valid integer
 296    *
 297    * @param s
 298    * @return
 299    */
 300   public static int parseInt(String s)
 301   {
 302     int result = 0;
 303     if (s != null && s.length() > 0)
 304     {
 305       try
 306       {
 307         result = Integer.parseInt(s);
 308       } catch (NumberFormatException ex)
 309       {
 310       }
 311     }
 312     return result;
 313   }
 314
 315   /**
 316    * Compares two versions formatted as e.g. "3.4.5" and returns -1, 0 or 1 as
 317    * the first version precedes, is equal to, or follows the second
 318    *
 319    * @param v1
 320    * @param v2
 321    * @return
 322    */
 323   public static int compareVersions(String v1, String v2)
 324   {
 325     return compareVersions(v1, v2, null);
 326   }
 327
 328   /**
 329    * Compares two versions formatted as e.g. "3.4.5b1" and returns -1, 0 or 1 as
 330    * the first version precedes, is equal to, or follows the second
 331    *
 332    * @param v1
 333    * @param v2
 334    * @param pointSeparator
 335    *          a string used to delimit point increments in sub-tokens of the
 336    *          version
 337    * @return
 338    */
 339   public static int compareVersions(String v1, String v2,
 340           String pointSeparator)
 341   {
 342     if (v1 == null || v2 == null)
 343     {
 344       return 0;
 345     }
 346     String[] toks1 = v1.split("\\.");
 347     String[] toks2 = v2.split("\\.");
 348     int i = 0;
 349     for (; i < toks1.length; i++)
 350     {
 351       if (i >= toks2.length)
 352       {
 353         /*
 354          * extra tokens in v1
 355          */
 356         return 1;
 357       }
 358       String tok1 = toks1[i];
 359       String tok2 = toks2[i];
 360       if (pointSeparator != null)
 361       {
 362         /*
 363          * convert e.g. 5b2 into decimal 5.2 for comparison purposes
 364          */
 365         tok1 = tok1.replace(pointSeparator, ".");
 366         tok2 = tok2.replace(pointSeparator, ".");
 367       }
 368       try
 369       {
 370         float f1 = Float.valueOf(tok1);
 371         float f2 = Float.valueOf(tok2);
 372         int comp = Float.compare(f1, f2);
 373         if (comp != 0)
 374         {
 375           return comp;
 376         }
 377       } catch (NumberFormatException e)
 378       {
 379         System.err
 380                 .println("Invalid version format found: " + e.getMessage());
 381         return 0;
 382       }
 383     }
 384
 385     if (i < toks2.length)
 386     {
 387       /*
 388        * extra tokens in v2
 389        */
 390       return -1;
 391     }
 392
 393     /*
 394      * same length, all tokens match
 395      */
 396     return 0;
 397   }
 398
 399   /**
 400    * Converts the string to all lower-case except the first character which is
 401    * upper-cased
 402    *
 403    * @param s
 404    * @return
 405    */
 406   public static String toSentenceCase(String s)
 407   {
 408     if (s == null)
 409     {
 410       return s;
 411     }
 412     if (s.length() <= 1)
 413     {
 414       return s.toUpperCase(Locale.ROOT);
 415     }
 416     return s.substring(0, 1).toUpperCase(Locale.ROOT)
 417             + s.substring(1).toLowerCase(Locale.ROOT);
 418   }
 419
 420   /**
 421    * A helper method that strips off any leading or trailing html and body tags.
 422    * If no html tag is found, then also html-encodes angle bracket characters.
 423    *
 424    * @param text
 425    * @return
 426    */
 427   public static String stripHtmlTags(String text)
 428   {
 429     if (text == null)
 430     {
 431       return null;
 432     }
 433     String tmp2up = text.toUpperCase(Locale.ROOT);
 434     int startTag = tmp2up.indexOf("<HTML>");
 435     if (startTag > -1)
 436     {
 437       text = text.substring(startTag + 6);
 438       tmp2up = tmp2up.substring(startTag + 6);
 439     }
 440     // is omission of "<BODY>" intentional here??
 441     int endTag = tmp2up.indexOf("</BODY>");
 442     if (endTag > -1)
 443     {
 444       text = text.substring(0, endTag);
 445       tmp2up = tmp2up.substring(0, endTag);
 446     }
 447     endTag = tmp2up.indexOf("</HTML>");
 448     if (endTag > -1)
 449     {
 450       text = text.substring(0, endTag);
 451     }
 452
 453     if (startTag == -1 && (text.contains("<") || text.contains(">")))
 454     {
 455       text = text.replaceAll("<", "&lt;");
 456       text = text.replaceAll(">", "&gt;");
 457     }
 458     return text;
 459   }
 460
 461   /**
 462    * Answers the input string with any occurrences of the 'encodeable'
 463    * characters replaced by their URL encoding
 464    *
 465    * @param s
 466    * @param encodable
 467    * @return
 468    */
 469   public static String urlEncode(String s, String encodable)
 470   {
 471     if (s == null || s.isEmpty())
 472     {
 473       return s;
 474     }
 475
 476     /*
 477      * do % encoding first, as otherwise it may double-encode!
 478      */
 479     if (encodable.indexOf(PERCENT) != -1)
 480     {
 481       s = urlEncode(s, PERCENT);
 482     }
 483
 484     for (char c : encodable.toCharArray())
 485     {
 486       if (c != PERCENT)
 487       {
 488         s = urlEncode(s, c);
 489       }
 490     }
 491     return s;
 492   }
 493
 494   /**
 495    * Answers the input string with any occurrences of {@code c} replaced with
 496    * their url encoding. Answers the input string if it is unchanged.
 497    *
 498    * @param s
 499    * @param c
 500    * @return
 501    */
 502   static String urlEncode(String s, char c)
 503   {
 504     String decoded = String.valueOf(c);
 505     if (s.indexOf(decoded) != -1)
 506     {
 507       String encoded = getUrlEncoding(c);
 508       if (!encoded.equals(decoded))
 509       {
 510         s = s.replace(decoded, encoded);
 511       }
 512     }
 513     return s;
 514   }
 515
 516   /**
 517    * Answers the input string with any occurrences of the specified (unencoded)
 518    * characters replaced by their URL decoding.
 519    * <p>
 520    * Example: {@code urlDecode("a%3Db%3Bc", "-;=,")} should answer
 521    * {@code "a=b;c"}.
 522    *
 523    * @param s
 524    * @param encodable
 525    * @return
 526    */
 527   public static String urlDecode(String s, String encodable)
 528   {
 529     if (s == null || s.isEmpty())
 530     {
 531       return s;
 532     }
 533
 534     for (char c : encodable.toCharArray())
 535     {
 536       String encoded = getUrlEncoding(c);
 537       if (s.indexOf(encoded) != -1)
 538       {
 539         String decoded = String.valueOf(c);
 540         s = s.replace(encoded, decoded);
 541       }
 542     }
 543     return s;
 544   }
 545
 546   /**
 547    * Does a lazy lookup of the url encoding of the given character, saving the
 548    * value for repeat lookups
 549    *
 550    * @param c
 551    * @return
 552    */
 553   private static String getUrlEncoding(char c)
 554   {
 555     if (c < 0 || c >= urlEncodings.length)
 556     {
 557       return String.valueOf(c);
 558     }
 559
 560     String enc = urlEncodings[c];
 561     if (enc == null)
 562     {
 563       try
 564       {
 565         enc = urlEncodings[c] = URLEncoder.encode(String.valueOf(c),
 566                 "UTF-8");
 567       } catch (UnsupportedEncodingException e)
 568       {
 569         enc = urlEncodings[c] = String.valueOf(c);
 570       }
 571     }
 572     return enc;
 573   }
 574
 575   public static int firstCharPosIgnoreCase(String text, String chars)
 576   {
 577     int min = text.length() + 1;
 578     for (char c : chars.toLowerCase(Locale.ROOT).toCharArray())
 579     {
 580       int i = text.toLowerCase(Locale.ROOT).indexOf(c);
 581       if (0 <= i && i < min)
 582       {
 583         min = i;
 584       }
 585     }
 586     return min < text.length() + 1 ? min : -1;
 587   }
 588
 589   public static int indexOfFirstWhitespace(String text)
 590   {
 591     int index = -1;
 592     Pattern pat = Pattern.compile("\\s");
 593     Matcher m = pat.matcher(text);
 594     if (m.find())
 595     {
 596       index = m.start();
 597     }
 598     return index;
 599   }
 600
 601   /*
 602    * implementation of String.replaceLast.
 603    * Replaces only the last occurrence of toReplace in string with replacement.
 604    */
 605   public static String replaceLast(String string, String toReplace,
 606           String replacement)
 607   {
 608     int pos = string.lastIndexOf(toReplace);
 609     if (pos > -1)
 610     {
 611       return new StringBuilder().append(string.substring(0, pos))
 612               .append(replacement)
 613               .append(string.substring(pos + toReplace.length()))
 614               .toString();
 615     }
 616     else
 617     {
 618       return string;
 619     }
 620   }
 621
 622 }