File vendor/joomla/string/src/phputf8/utils/position.php | Joomla! Framework TM

  1 <?php
  2 /**
  3 * Locate a byte index given a UTF-8 character index
  4 * @package utf8
  5 */
  6 
  7 //--------------------------------------------------------------------
  8 /**
  9 * Given a string and a character index in the string, in
 10 * terms of the UTF-8 character position, returns the byte
 11 * index of that character. Can be useful when you want to
 12 * PHP's native string functions but we warned, locating
 13 * the byte can be expensive
 14 * Takes variable number of parameters - first must be
 15 * the search string then 1 to n UTF-8 character positions
 16 * to obtain byte indexes for - it is more efficient to search
 17 * the string for multiple characters at once, than make
 18 * repeated calls to this function
 19 *
 20 * @author Chris Smith<chris@jalakai.co.uk>
 21 * @param string string to locate index in
 22 * @param int (n times)
 23 * @return mixed - int if only one input int, array if more
 24 * @return boolean TRUE if it's all ASCII
 25 * @package utf8
 26 */
 27 function utf8_byte_position() {
 28 
 29     $args = func_get_args();
 30     $str =& array_shift($args);
 31     if (!is_string($str)) return false;
 32 
 33     $result = array();
 34 
 35     // trivial byte index, character offset pair
 36     $prev = array(0,0);
 37 
 38     // use a short piece of str to estimate bytes per character
 39     // $i (& $j) -> byte indexes into $str
 40     $i = utf8_locate_next_chr($str, 300);
 41 
 42     // $c -> character offset into $str
 43     $c = strlen(utf8_decode(substr($str,0,$i)));
 44 
 45     // deal with arguments from lowest to highest
 46     sort($args);
 47 
 48     foreach ($args as $offset) {
 49         // sanity checks FIXME
 50 
 51         // 0 is an easy check
 52         if ($offset == 0) { $result[] = 0; continue; }
 53 
 54         // ensure no endless looping
 55         $safety_valve = 50;
 56 
 57         do {
 58 
 59             if ( ($c - $prev[1]) == 0 ) {
 60                 // Hack: gone past end of string
 61                 $error = 0;
 62                 $i = strlen($str);
 63                 break;
 64             }
 65 
 66             $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
 67 
 68             // correct to utf8 character boundary
 69             $j = utf8_locate_next_chr($str, $j);
 70 
 71             // save the index, offset for use next iteration
 72             $prev = array($i,$c);
 73 
 74             if ($j > $i) {
 75                 // determine new character offset
 76                 $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
 77             } else {
 78                 // ditto
 79                 $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
 80             }
 81 
 82             $error = abs($c-$offset);
 83 
 84             // ready for next time around
 85             $i = $j;
 86 
 87         // from 7 it is faster to iterate over the string
 88         } while ( ($error > 7) && --$safety_valve) ;
 89 
 90         if ($error && $error <= 7) {
 91 
 92             if ($c < $offset) {
 93                 // move up
 94                 while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
 95             } else {
 96                 // move down
 97                 while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
 98             }
 99 
100             // ready for next arg
101             $c = $offset;
102         }
103         $result[] = $i;
104     }
105 
106     if ( count($result) == 1 ) {
107         return $result[0];
108     }
109 
110     return $result;
111 }
112 
113 //--------------------------------------------------------------------
/**
* Finds the byte index at which the UTF-8 character covering a given
* byte index starts. When the supplied index already points at the
* first byte of a character it is returned unchanged; otherwise the
* index is walked backwards over continuation bytes until the start
* of the current character is reached.
*
* An index of zero or less yields 0, and an index at or beyond the
* end of the string yields the string's byte length.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
*/
function utf8_locate_current_chr( &$str, $idx ) {

    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);
    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies it - assuming well formed UTF-8
    while ($idx) {
        $isContinuationByte = ((ord($str[$idx]) & 0xC0) == 0x80);
        if (!$isContinuationByte) {
            break;
        }
        $idx--;
    }

    return $idx;
}
141 
142 //--------------------------------------------------------------------
/**
* Finds the byte index at which the next UTF-8 character starts,
* relative to a given byte index. When the supplied index already
* points at the first byte of a character it is returned unchanged;
* otherwise the index is walked forwards over continuation bytes.
*
* An index of zero or less yields 0, and an index at or beyond the
* end of the string yields the string's byte length.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
*/
function utf8_locate_next_chr( &$str, $idx ) {

    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);
    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies it - assuming well formed UTF-8
    while ($idx < $byteLength) {
        $isContinuationByte = ((ord($str[$idx]) & 0xC0) == 0x80);
        if (!$isContinuationByte) {
            break;
        }
        $idx++;
    }

    return $idx;
}
168 
169
Namespaces

Classes

Interfaces

Exceptions

Constants

Functions