1 <?php
2 /**
3 * Locate a byte index given a UTF-8 character index
4 * @package utf8
5 */
6
7 //--------------------------------------------------------------------
/**
 * Translates UTF-8 character positions into byte indexes.
 *
 * Given a string and one or more character indexes (counted in UTF-8
 * characters), returns the byte index at which each of those characters
 * starts. Can be useful when you want to use PHP's native (byte oriented)
 * string functions, but be warned: locating the byte can be expensive.
 *
 * Takes a variable number of parameters - the first must be the string,
 * followed by 1 to n UTF-8 character positions. It is more efficient to
 * look up several positions in one call than to make repeated calls to
 * this function.
 *
 * The positions are processed, and the byte indexes returned, in ascending
 * order of position - not in the order the arguments were supplied. A
 * position beyond the end of the string yields strlen() of the string.
 *
 * The string is assumed to be well formed UTF-8.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string string to locate index in
 * @param int (n times) UTF-8 character positions
 * @return mixed int if exactly one position was given, otherwise an array
 *               of ints (empty if no position was given); FALSE if the
 *               first argument is not a string
 * @see utf8_locate_next_chr
 * @see utf8_locate_current_chr
 * @package utf8
 */
function utf8_byte_position() {

    $args = func_get_args();

    // plain assignment: binding a reference to a function result raises
    // "Only variables should be assigned by reference"
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $limit = strlen($str);
    $result = array();

    // Matches runs of UTF-8 continuation bytes (10xxxxxx). Stripping them
    // leaves exactly one byte per character, so strlen() of what remains is
    // the character count. Used instead of utf8_decode(), which is
    // deprecated as of PHP 8.2.
    $continuation = '/[\x80-\xBF]+/';

    // trivial byte index, character offset pair
    $prev = array(0, 0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str, always describing byte index $i
    $c = strlen(preg_replace($continuation, '', substr($str, 0, $i)));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            $span = $c - $prev[1];

            if ($span == 0) {
                // No characters lie between the two reference points, so
                // there is no bytes-per-character ratio to extrapolate
                // from (e.g. empty string): report the end of the string
                $error = 0;
                $i = $limit;
                break;
            }

            // linear estimate of the target byte from the last two points
            $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / $span);

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            if ($j === $i) {
                // The estimate did not move. Either we are on (or within
                // walking distance of) the target, or we are pinned against
                // an end of the string. Extrapolating again cannot help,
                // and storing this zero-length step in $prev would wipe out
                // the ratio needed by the remaining offsets.
                $error = abs($c - $offset);
                break;
            }

            // save the index, offset for use next iteration
            $prev = array($i, $c);

            if ($j > $i) {
                // moved forwards: add the characters stepped over
                $c += strlen(preg_replace($continuation, '', substr($str, $i, $j - $i)));
            } else {
                // moved backwards: subtract the characters stepped over
                $c -= strlen(preg_replace($continuation, '', substr($str, $j, $i - $j)));
            }

            $error = abs($c - $offset);

            // ready for next time around
            $i = $j;

            // from 7 it is faster to iterate over the string
        } while (($error > 7) && --$safety_valve);

        if ($error && $error <= 7) {

            // Walk the last few characters one at a time. $c is adjusted
            // per character actually crossed, so it stays in step with $i
            // even when the walk is cut short by an end of the string.
            if ($c < $offset) {
                // move up
                while ($c < $offset && $i < $limit) {
                    $i = utf8_locate_next_chr($str, $i + 1);
                    $c++;
                }
            } else {
                // move down
                while ($c > $offset && $i > 0) {
                    $i = utf8_locate_current_chr($str, $i - 1);
                    $c--;
                }
            }
        }

        $result[] = $i;
    }

    if (count($result) == 1) {
        return $result[0];
    }

    return $result;
}
112
113 //--------------------------------------------------------------------
/**
 * Finds where the UTF-8 character containing a given byte begins.
 *
 * If the supplied byte index already points at the first byte of a
 * character, that same index is returned. Otherwise the index is stepped
 * backwards over continuation bytes until the first byte of the current
 * character (or the start of the string) is reached.
 *
 * An index of 0 or less gives 0; an index at or beyond the end of the
 * string gives the string's byte length. The string is only read, never
 * modified, and is assumed to be well formed UTF-8.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of current UTF-8 character
 * @package utf8
 */
function utf8_locate_current_chr( &$str, $idx ) {

    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);
    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 and comparing to 0x80 identifies it.
    do {
        $isContinuation = (ord($str[$idx]) & 0xC0) === 0x80;
        if (!$isContinuation) {
            // found the first byte of the character
            return $idx;
        }
    } while (--$idx);

    // backed all the way up to the start of the string
    return $idx;
}
141
142 //--------------------------------------------------------------------
/**
 * Finds the nearest UTF-8 character start at or after a given byte.
 *
 * If the supplied byte index already points at the first byte of a
 * character, that same index is returned. Otherwise the index is stepped
 * forwards over continuation bytes until the first byte of the next
 * character (or the end of the string) is reached.
 *
 * An index of 0 or less gives 0; an index at or beyond the end of the
 * string gives the string's byte length. The string is only read, never
 * modified, and is assumed to be well formed UTF-8.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of next UTF-8 character
 * @package utf8
 */
function utf8_locate_next_chr( &$str, $idx ) {

    if ($idx <= 0) {
        return 0;
    }

    $byteLength = strlen($str);
    if ($idx >= $byteLength) {
        return $byteLength;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 and comparing to 0x80 identifies it.
    for (; $idx < $byteLength; $idx++) {
        if ((ord($str[$idx]) & 0xC0) !== 0x80) {
            // not a continuation byte: a character starts here
            break;
        }
    }

    return $idx;
}
168
169