1 <?php
2 /**
3 * Part of the Joomla Framework String Package
4 *
5 * @copyright Copyright (C) 2005 - 2015 Open Source Matters, Inc. All rights reserved.
6 * @license GNU General Public License version 2 or later; see LICENSE
7 */
8
9 namespace Joomla\String;
10
// Make UTF-8 the working charset for PHP itself and, on older runtimes, for mbstring and iconv.
if (version_compare(PHP_VERSION, '5.6', '<'))
{
	// Before PHP 5.6 each extension is configured on its own, and only when it is actually available.
	if (extension_loaded('mbstring'))
	{
		@ini_set('mbstring.internal_encoding', 'UTF-8');
		@ini_set('mbstring.http_input', 'UTF-8');
		@ini_set('mbstring.http_output', 'UTF-8');
	}

	// Same treatment for iconv
	if (function_exists('iconv'))
	{
		iconv_set_encoding('internal_encoding', 'UTF-8');
		iconv_set_encoding('input_encoding', 'UTF-8');
		iconv_set_encoding('output_encoding', 'UTF-8');
	}
}
else
{
	// From PHP 5.6 on, only the global default charset is set; failures are deliberately silenced.
	@ini_set('default_charset', 'UTF-8');
}
34
/**
 * String handling class for UTF-8 data wrapping the phputf8 library. All functions assume the validity of UTF-8 strings.
 *
 * Most methods are thin static wrappers that forward to the global utf8_*() functions, which must already be loaded.
 *
 * @since  1.3.0
 */
abstract class StringHelper
{
	/**
	 * Increment styles.
	 *
	 * Each style is a two element list: element 0 holds the regular expression(s) and element 1 the sprintf() format(s).
	 * Either element may be a single string, which is then used for both roles, or a pair:
	 *  - patterns: array(search pattern capturing the number, pattern that gets replaced)
	 *  - formats:  array(format used when appending a new counter, format used when replacing an existing one)
	 *
	 * @var    array
	 * @since  1.3.0
	 */
	protected static $incrementStyles = array(
		'dash' => array(
			'#-(\d+)$#',
			'-%d'
		),
		'default' => array(
			array('#\((\d+)\)$#', '#\(\d+\)$#'),
			array(' (%d)', '(%d)'),
		),
	);

	/**
	 * Increments a trailing number in a string.
	 *
	 * Used to easily create distinct labels when copying objects. The method has the following styles:
	 *
	 * default: "Label" becomes "Label (2)"
	 * dash:    "Label" becomes "Label-2"
	 *
	 * @param   string   $string  The source string.
	 * @param   string   $style   The style (default|dash). Unknown style names fall back to 'default'.
	 * @param   integer  $n       If supplied (non-empty), this number is used for the copy, otherwise it is the 'next' number.
	 *
	 * @return  string  The incremented string.
	 *
	 * @since   1.3.0
	 */
	public static function increment($string, $style = 'default', $n = 0)
	{
		$styleSpec = isset(static::$incrementStyles[$style]) ? static::$incrementStyles[$style] : static::$incrementStyles['default'];

		// Regular expression search and replace patterns; a single string serves both roles.
		list($rxSearch, $rxReplace) = is_array($styleSpec[0]) ? $styleSpec[0] : array($styleSpec[0], $styleSpec[0]);

		// New and old (existing) sprintf formats; a single string serves both roles.
		list($newFormat, $oldFormat) = is_array($styleSpec[1]) ? $styleSpec[1] : array($styleSpec[1], $styleSpec[1]);

		// The string already ends with a counter: replace it with the requested or the next number.
		if (preg_match($rxSearch, $string, $matches))
		{
			$n = empty($n) ? ($matches[1] + 1) : $n;

			return preg_replace($rxReplace, sprintf($oldFormat, $n), $string);
		}

		// No counter yet: append one, starting at 2 unless a number was requested.
		$n = empty($n) ? 2 : $n;

		return $string . sprintf($newFormat, $n);
	}

	/**
	 * Tests whether a string contains only 7bit ASCII bytes.
	 *
	 * You might use this to conditionally check whether a string needs handling as UTF-8 or not, potentially offering performance
	 * benefits by using the native PHP equivalent if it's just ASCII e.g.;
	 *
	 * <code>
	 * if (StringHelper::is_ascii($someString))
	 * {
	 *     // It's just ASCII - use the native PHP version
	 *     $someString = strtolower($someString);
	 * }
	 * else
	 * {
	 *     $someString = StringHelper::strtolower($someString);
	 * }
	 * </code>
	 *
	 * @param   string  $str  The string to test.
	 *
	 * @return  boolean True if the string is all ASCII
	 *
	 * @since   1.3.0
	 */
	public static function is_ascii($str)
	{
		return utf8_is_ascii($str);
	}

	/**
	 * UTF-8 aware alternative to ord()
	 *
	 * Returns the unicode ordinal for a character.
	 *
	 * @param   string  $chr  UTF-8 encoded character
	 *
	 * @return  integer Unicode ordinal for the character
	 *
	 * @see     http://www.php.net/ord
	 * @since   1.4.0
	 */
	public static function ord($chr)
	{
		return utf8_ord($chr);
	}

	/**
	 * UTF-8 aware alternative to strpos()
	 *
	 * Find position of first occurrence of a string.
	 *
	 * @param   string   $str     String being examined
	 * @param   string   $search  String being searched for
	 * @param   integer  $offset  Optional, specifies the position from which the search should be performed.
	 *                            When left at false the offset argument is not forwarded at all.
	 *
	 * @return  mixed  Number of characters before the first match or FALSE on failure
	 *
	 * @see     http://www.php.net/strpos
	 * @since   1.3.0
	 */
	public static function strpos($str, $search, $offset = false)
	{
		if ($offset === false)
		{
			return utf8_strpos($str, $search);
		}

		return utf8_strpos($str, $search, $offset);
	}

	/**
	 * UTF-8 aware alternative to strrpos()
	 *
	 * Finds position of last occurrence of a string.
	 *
	 * @param   string   $str     String being examined.
	 * @param   string   $search  String being searched for.
	 * @param   integer  $offset  Offset from the left of the string.
	 *
	 * @return  mixed  Number of characters before the last match or false on failure
	 *
	 * @see     http://www.php.net/strrpos
	 * @since   1.3.0
	 */
	public static function strrpos($str, $search, $offset = 0)
	{
		return utf8_strrpos($str, $search, $offset);
	}

	/**
	 * UTF-8 aware alternative to substr()
	 *
	 * Return part of a string given character offset (and optionally length).
	 *
	 * @param   string   $str     String being processed
	 * @param   integer  $offset  Number of UTF-8 characters offset (from left)
	 * @param   integer  $length  Optional length in UTF-8 characters from offset.
	 *                            When left at false the length argument is not forwarded at all.
	 *
	 * @return  mixed string or FALSE if failure
	 *
	 * @see     http://www.php.net/substr
	 * @since   1.3.0
	 */
	public static function substr($str, $offset, $length = false)
	{
		if ($length === false)
		{
			return utf8_substr($str, $offset);
		}

		return utf8_substr($str, $offset, $length);
	}

	/**
	 * UTF-8 aware alternative to strtolower()
	 *
	 * Make a string lowercase
	 *
	 * Note: The concept of a character's "case" only exists in some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	 * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings
	 *
	 * @param   string  $str  String being processed
	 *
	 * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
	 *
	 * @see     http://www.php.net/strtolower
	 * @since   1.3.0
	 */
	public static function strtolower($str)
	{
		return utf8_strtolower($str);
	}

	/**
	 * UTF-8 aware alternative to strtoupper()
	 *
	 * Make a string uppercase
	 *
	 * Note: The concept of a character's "case" only exists in some alphabets such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	 * not exist in the Chinese alphabet, for example. See Unicode Standard Annex #21: Case Mappings
	 *
	 * @param   string  $str  String being processed
	 *
	 * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
	 *
	 * @see     http://www.php.net/strtoupper
	 * @since   1.3.0
	 */
	public static function strtoupper($str)
	{
		return utf8_strtoupper($str);
	}

	/**
	 * UTF-8 aware alternative to strlen()
	 *
	 * Returns the number of characters in the string (NOT THE NUMBER OF BYTES).
	 *
	 * @param   string  $str  UTF-8 string.
	 *
	 * @return  integer  Number of UTF-8 characters in string.
	 *
	 * @see     http://www.php.net/strlen
	 * @since   1.3.0
	 */
	public static function strlen($str)
	{
		return utf8_strlen($str);
	}

	/**
	 * UTF-8 aware alternative to str_ireplace()
	 *
	 * Case-insensitive version of str_replace()
	 *
	 * @param   string   $search   String to search for
	 * @param   string   $replace  New string to replace the matches with
	 * @param   string   $str      The string in which the replacement takes place
	 * @param   integer  $count    Optional value forwarded as the fourth argument of utf8_ireplace(). It is received by value,
	 *                             so nothing is written back to the caller. Both null and false mean "not supplied".
	 *                             NOTE(review): phputf8 appears to treat it as the maximum number of replacements - confirm.
	 *
	 * @return  string  UTF-8 String
	 *
	 * @see     http://www.php.net/str_ireplace
	 * @since   1.3.0
	 */
	public static function str_ireplace($search, $replace, $str, $count = null)
	{
		// The declared default is null, so null must be recognised as "omitted" as well as the legacy false.
		if ($count === null || $count === false)
		{
			return utf8_ireplace($search, $replace, $str);
		}

		return utf8_ireplace($search, $replace, $str, $count);
	}

	/**
	 * UTF-8 aware alternative to str_pad()
	 *
	 * Pad a string to a certain length with another string.
	 * $padStr may contain multi-byte characters.
	 *
	 * @param   string   $input   The input string.
	 * @param   integer  $length  If the value is negative, less than, or equal to the length of the input string, no padding takes place.
	 * @param   string   $padStr  The string may be truncated if the number of padding characters can't be evenly divided by the string's length.
	 * @param   integer  $type    The type of padding to apply
	 *
	 * @return  string
	 *
	 * @see     http://www.php.net/str_pad
	 * @since   1.4.0
	 */
	public static function str_pad($input, $length, $padStr = ' ', $type = STR_PAD_RIGHT)
	{
		return utf8_str_pad($input, $length, $padStr, $type);
	}

	/**
	 * UTF-8 aware alternative to str_split()
	 *
	 * Convert a string to an array.
	 *
	 * @param   string   $str        UTF-8 encoded string to process
	 * @param   integer  $split_len  Number of characters to split string by
	 *
	 * @return  array
	 *
	 * @see     http://www.php.net/str_split
	 * @since   1.3.0
	 */
	public static function str_split($str, $split_len = 1)
	{
		return utf8_str_split($str, $split_len);
	}

	/**
	 * UTF-8/LOCALE aware alternative to strcasecmp()
	 *
	 * A case insensitive string comparison. When a locale is given, the process-wide LC_COLLATE locale is switched to it
	 * (and is not switched back), both strings are lowercased and compared with strcoll().
	 *
	 * @param   string  $str1    string 1 to compare
	 * @param   string  $str2    string 2 to compare
	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
	 *
	 * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
	 *
	 * @see     http://www.php.net/strcasecmp
	 * @see     http://www.php.net/strcoll
	 * @see     http://www.php.net/setlocale
	 * @since   1.3.0
	 */
	public static function strcasecmp($str1, $str2, $locale = false)
	{
		if (!$locale)
		{
			return utf8_strcasecmp($str1, $str2);
		}

		$codepage = self::applyCollationLocale($locale);
		$lower1   = utf8_strtolower($str1);
		$lower2   = utf8_strtolower($str2);

		// UTF-8 locale, or an encoding we cannot map to a code page: compare the strings without recoding.
		if ($codepage === null)
		{
			return strcoll($lower1, $lower2);
		}

		return strcoll(
			static::transcode($lower1, 'UTF-8', $codepage),
			static::transcode($lower2, 'UTF-8', $codepage)
		);
	}

	/**
	 * UTF-8/LOCALE aware alternative to strcmp()
	 *
	 * A case sensitive string comparison. When a locale is given, the process-wide LC_COLLATE locale is switched to it
	 * (and is not switched back) and the strings are compared with strcoll().
	 *
	 * @param   string  $str1    string 1 to compare
	 * @param   string  $str2    string 2 to compare
	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
	 *
	 * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
	 *
	 * @see     http://www.php.net/strcmp
	 * @see     http://www.php.net/strcoll
	 * @see     http://www.php.net/setlocale
	 * @since   1.3.0
	 */
	public static function strcmp($str1, $str2, $locale = false)
	{
		if (!$locale)
		{
			return strcmp($str1, $str2);
		}

		$codepage = self::applyCollationLocale($locale);

		// UTF-8 locale, or an encoding we cannot map to a code page: compare the strings without recoding.
		if ($codepage === null)
		{
			return strcoll($str1, $str2);
		}

		return strcoll(static::transcode($str1, 'UTF-8', $codepage), static::transcode($str2, 'UTF-8', $codepage));
	}

	/**
	 * Switches LC_COLLATE to the requested locale and reports whether strings must be recoded before strcoll().
	 *
	 * If the requested locale cannot be set, the locale that was active before the call is inspected instead.
	 * The locale is intentionally left as set; it is not restored by this method.
	 *
	 * @param   mixed  $locale  The locale (or list of locales) handed to setlocale().
	 *
	 * @return  string|null  'CP' followed by the numeric suffix of the active locale name (e.g. a name ending in ".1252"
	 *                       yields "CP1252") when that name is not UTF-8, contains an underscore and ends in ".<digits>";
	 *                       null when no recoding should take place.
	 */
	private static function applyCollationLocale($locale)
	{
		// Remember the current locale so it can stand in when the requested one is rejected.
		$previous = setlocale(LC_COLLATE, 0);
		$active   = setlocale(LC_COLLATE, $locale);

		if (!$active)
		{
			$active = $previous;
		}

		// A non UTF-8 locale name that carries a numeric code page suffix.
		if (!stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m))
		{
			return 'CP' . $m[1];
		}

		// Either UTF-8 was set successfully or the encoding is something we cannot recode to.
		return null;
	}

	/**
	 * UTF-8 aware alternative to strcspn()
	 *
	 * Find length of initial segment not matching mask.
	 *
	 * @param   string   $str     The string to process
	 * @param   string   $mask    The mask
	 * @param   integer  $start   Optional starting character position (in characters). null or false means "not supplied".
	 * @param   integer  $length  Optional length. null or false means "not supplied".
	 *
	 * @return  integer  The length of the initial segment of str1 which does not contain any of the characters in str2
	 *
	 * @see     http://www.php.net/strcspn
	 * @since   1.3.0
	 */
	public static function strcspn($str, $mask, $start = null, $length = null)
	{
		// The declared defaults are null; false is still honoured for callers written against the old checks.
		$noStart  = ($start === null || $start === false);
		$noLength = ($length === null || $length === false);

		if ($noStart && $noLength)
		{
			return utf8_strcspn($str, $mask);
		}

		if ($noLength)
		{
			return utf8_strcspn($str, $mask, $start);
		}

		return utf8_strcspn($str, $mask, $start, $length);
	}

	/**
	 * UTF-8 aware alternative to stristr()
	 *
	 * Returns all of haystack from the first occurrence of needle to the end. Needle and haystack are examined in a case-insensitive manner to
	 * find the first occurrence of a string using case insensitive comparison.
	 *
	 * @param   string  $str     The haystack
	 * @param   string  $search  The needle
	 *
	 * @return  string the sub string
	 *
	 * @see     http://www.php.net/stristr
	 * @since   1.3.0
	 */
	public static function stristr($str, $search)
	{
		return utf8_stristr($str, $search);
	}

	/**
	 * UTF-8 aware alternative to strrev()
	 *
	 * Reverse a string.
	 *
	 * @param   string  $str  String to be reversed
	 *
	 * @return  string   The string in reverse character order
	 *
	 * @see     http://www.php.net/strrev
	 * @since   1.3.0
	 */
	public static function strrev($str)
	{
		return utf8_strrev($str);
	}

	/**
	 * UTF-8 aware alternative to strspn()
	 *
	 * Find length of initial segment matching mask.
	 *
	 * @param   string   $str     The haystack
	 * @param   string   $mask    The mask
	 * @param   integer  $start   Start optional
	 * @param   integer  $length  Length optional
	 *
	 * @return  integer
	 *
	 * @see     http://www.php.net/strspn
	 * @since   1.3.0
	 */
	public static function strspn($str, $mask, $start = null, $length = null)
	{
		if ($start === null && $length === null)
		{
			return utf8_strspn($str, $mask);
		}

		if ($length === null)
		{
			return utf8_strspn($str, $mask, $start);
		}

		return utf8_strspn($str, $mask, $start, $length);
	}

	/**
	 * UTF-8 aware alternative to substr_replace()
	 *
	 * Replace text within a portion of a string.
	 *
	 * @param   string   $str     The haystack
	 * @param   string   $repl    The replacement string
	 * @param   integer  $start   Start
	 * @param   integer  $length  Length (optional). null or false means "not supplied".
	 *
	 * @return  string
	 *
	 * @see     http://www.php.net/substr_replace
	 * @since   1.3.0
	 */
	public static function substr_replace($str, $repl, $start, $length = null)
	{
		// The declared default is null, so null must be recognised as "omitted" as well as the legacy false.
		if ($length === null || $length === false)
		{
			return utf8_substr_replace($str, $repl, $start);
		}

		return utf8_substr_replace($str, $repl, $start, $length);
	}

	/**
	 * UTF-8 aware replacement for ltrim()
	 *
	 * Strip whitespace (or other characters) from the beginning of a string. You only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise ltrim will work normally on a UTF-8 string.
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim. false trims whitespace;
	 *                             an empty charlist returns the string untouched.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/ltrim
	 * @since   1.3.0
	 */
	public static function ltrim($str, $charlist = false)
	{
		// empty() is also true for the string '0', which is a legitimate one character charlist.
		if (empty($charlist) && $charlist !== false && $charlist !== '0')
		{
			return $str;
		}

		if ($charlist === false)
		{
			return utf8_ltrim($str);
		}

		return utf8_ltrim($str, $charlist);
	}

	/**
	 * UTF-8 aware replacement for rtrim()
	 *
	 * Strip whitespace (or other characters) from the end of a string. You only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise rtrim will work normally on a UTF-8 string.
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim. false trims whitespace;
	 *                             an empty charlist returns the string untouched.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/rtrim
	 * @since   1.3.0
	 */
	public static function rtrim($str, $charlist = false)
	{
		// empty() is also true for the string '0', which is a legitimate one character charlist.
		if (empty($charlist) && $charlist !== false && $charlist !== '0')
		{
			return $str;
		}

		if ($charlist === false)
		{
			return utf8_rtrim($str);
		}

		return utf8_rtrim($str, $charlist);
	}

	/**
	 * UTF-8 aware replacement for trim()
	 *
	 * Strip whitespace (or other characters) from the beginning and end of a string. You only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise trim will work normally on a UTF-8 string
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim. false trims whitespace;
	 *                             an empty charlist returns the string untouched.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/trim
	 * @since   1.3.0
	 */
	public static function trim($str, $charlist = false)
	{
		// empty() is also true for the string '0', which is a legitimate one character charlist.
		if (empty($charlist) && $charlist !== false && $charlist !== '0')
		{
			return $str;
		}

		if ($charlist === false)
		{
			return utf8_trim($str);
		}

		return utf8_trim($str, $charlist);
	}

	/**
	 * UTF-8 aware alternative to ucfirst()
	 *
	 * Make a string's first character uppercase or all words' first character uppercase.
	 *
	 * @param   string  $str           String to be processed
	 * @param   string  $delimiter     The words delimiter (null means do not split the string)
	 * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
	 *
	 * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
	 *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
	 *                  and return the string with the new delimiter
	 *
	 * @see     http://www.php.net/ucfirst
	 * @since   1.3.0
	 */
	public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
	{
		if ($delimiter === null)
		{
			return utf8_ucfirst($str);
		}

		if ($newDelimiter === null)
		{
			$newDelimiter = $delimiter;
		}

		// Split on the old delimiter, capitalise every piece, glue back together with the new delimiter.
		return implode($newDelimiter, array_map('utf8_ucfirst', explode($delimiter, $str)));
	}

	/**
	 * UTF-8 aware alternative to ucwords()
	 *
	 * Uppercase the first character of each word in a string.
	 *
	 * @param   string  $str  String to be processed
	 *
	 * @return  string  String with first char of each word uppercase
	 *
	 * @see     http://www.php.net/ucwords
	 * @since   1.3.0
	 */
	public static function ucwords($str)
	{
		return utf8_ucwords($str);
	}

	/**
	 * Transcode a string.
	 *
	 * The iconv option suffix depends on the iconv implementation: glibc gets '//TRANSLIT,IGNORE' with errors
	 * suppressed, every other implementation gets '//IGNORE//TRANSLIT'.
	 *
	 * @param   string  $source         The string to transcode.
	 * @param   string  $from_encoding  The source encoding.
	 * @param   string  $to_encoding    The target encoding.
	 *
	 * @return  mixed  The result of iconv(), or null if the source was not a string.
	 *
	 * @link    https://bugs.php.net/bug.php?id=48147
	 *
	 * @since   1.3.0
	 */
	public static function transcode($source, $from_encoding, $to_encoding)
	{
		if (!is_string($source))
		{
			return null;
		}

		if (ICONV_IMPL === 'glibc')
		{
			return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);
		}

		// libiconv and anything else
		return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
	}

	/**
	 * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
	 *
	 * Note: this function has been modified to simply return true or false.
	 *
	 * @param   string  $str  UTF-8 encoded string.
	 *
	 * @return  boolean  true if valid
	 *
	 * @author  <hsivonen@iki.fi>
	 * @see     http://hsivonen.iki.fi/php-utf8/
	 * @see     StringHelper::compliant
	 * @since   1.3.0
	 */
	public static function valid($str)
	{
		return utf8_is_valid($str);
	}

	/**
	 * Tests whether a string complies as UTF-8.
	 *
	 * This will be much faster than StringHelper::valid() but will pass five and six octet UTF-8 sequences, which are not supported by Unicode and
	 * so cannot be displayed correctly in a browser. In other words it is not as strict as StringHelper::valid() but it's faster. If you use it to
	 * validate user input, you place yourself at the risk that attackers will be able to inject 5 and 6 byte sequences (which may or may not be a
	 * significant risk, depending on what you are doing).
	 *
	 * @param   string  $str  UTF-8 string to check
	 *
	 * @return  boolean  TRUE if string is valid UTF-8
	 *
	 * @see     StringHelper::valid
	 * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
	 * @since   1.3.0
	 */
	public static function compliant($str)
	{
		return utf8_compliant($str);
	}

	/**
	 * Converts Unicode escape sequences to a UTF-8 string.
	 *
	 * Every literal backslash-u followed by four hex digits is replaced by the character it denotes: the two bytes are
	 * read as UCS-2BE and converted to UTF-8. If the mbstring extension is not loaded the input is returned unchanged.
	 *
	 * @param   string  $str  String containing \uXXXX escape sequences
	 *
	 * @return  string  UTF-8 string
	 *
	 * @since   1.3.0
	 */
	public static function unicode_to_utf8($str)
	{
		if (extension_loaded('mbstring'))
		{
			return preg_replace_callback(
				'/\\\\u([0-9a-fA-F]{4})/',
				function ($match)
				{
					// pack('H*') turns the four hex digits into the two raw bytes of the code unit.
					return mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UCS-2BE');
				},
				$str
			);
		}

		return $str;
	}

	/**
	 * Converts Unicode escape sequences, read as UTF-16BE code units, to a string.
	 *
	 * Every literal backslash-u followed by four hex digits is replaced individually: the two bytes are read as UTF-16BE
	 * and converted with 'UTF-8' as the target encoding, so despite the method name the returned string is UTF-8.
	 * The only difference to unicode_to_utf8() is the source encoding. If the mbstring extension is not loaded the
	 * input is returned unchanged.
	 *
	 * @param   string  $str  String containing \uXXXX escape sequences
	 *
	 * @return  string  The converted string (UTF-8 encoded, see above)
	 *
	 * @since   1.3.0
	 */
	public static function unicode_to_utf16($str)
	{
		if (extension_loaded('mbstring'))
		{
			return preg_replace_callback(
				'/\\\\u([0-9a-fA-F]{4})/',
				function ($match)
				{
					// pack('H*') turns the four hex digits into the two raw bytes of the code unit.
					return mb_convert_encoding(pack('H*', $match[1]), 'UTF-8', 'UTF-16BE');
				},
				$str
			);
		}

		return $str;
	}
}
841