1 <?php
2 /**
3 * Tools for locating / replacing bad bytes in UTF-8 strings
4 * The Original Code is Mozilla Communicator client code.
5 * The Initial Developer of the Original Code is
6 * Netscape Communications Corporation.
7 * Portions created by the Initial Developer are Copyright (C) 1998
8 * the Initial Developer. All Rights Reserved.
9 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
10 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
11 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
12 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
13 * @see http://hsivonen.iki.fi/php-utf8/
14 * @package utf8
15 * @see utf8_is_valid
16 */
17
18 //--------------------------------------------------------------------
/**
 * Locates the first bad byte in a UTF-8 string and returns its byte index.
 *
 * The string is walked one match at a time with a PCRE pattern: every
 * alternative but the last accepts exactly one well-formed UTF-8 character,
 * and the last alternative (capture group 2) accepts any single byte that
 * none of the others could consume, i.e. an invalid byte.
 *
 * The pattern comes from the W3 FAQ "Multilingual Forms", modified to accept
 * the full ASCII range including control characters.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str string to inspect
 * @return mixed integer byte index of the first bad byte, or FALSE if none was found
 * @package utf8
 */
function utf8_bad_find($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                          // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
        '|(.{1}))';                              // invalid byte

    // \G pins every match to $pos, so the subject is scanned in place rather
    // than being re-copied with substr() after each character. The s modifier
    // lets the fallback '.' consume any byte.
    $pattern = '/\G'.$UTF8_BAD.'/sS';
    $len = strlen($str);
    $pos = 0;

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 is only present when the "invalid byte" alternative matched
        if (isset($matches[2])) {
            return $pos;
        }
        $pos += strlen($matches[0]);
    }

    return FALSE;
}
53
54 //--------------------------------------------------------------------
/**
 * Locates all bad bytes in a UTF-8 string and returns a list of their byte
 * indexes.
 *
 * The string is walked one match at a time with a PCRE pattern: every
 * alternative but the last accepts exactly one well-formed UTF-8 character,
 * and the last alternative (capture group 2) accepts any single byte that
 * none of the others could consume, i.e. an invalid byte.
 *
 * The pattern comes from the W3 FAQ "Multilingual Forms", modified to accept
 * the full ASCII range including control characters.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str string to inspect
 * @return mixed array of integer byte indexes (ascending), or FALSE if no bad byte was found
 * @package utf8
 */
function utf8_bad_findall($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                          // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
        '|(.{1}))';                              // invalid byte

    // \G pins every match to $pos, so the subject is scanned in place rather
    // than being re-copied with substr() after each character. The s modifier
    // lets the fallback '.' consume any byte.
    $pattern = '/\G'.$UTF8_BAD.'/sS';
    $len = strlen($str);
    $pos = 0;
    $badList = array();

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 is only present when the "invalid byte" alternative matched
        if (isset($matches[2])) {
            $badList[] = $pos;
        }
        $pos += strlen($matches[0]);
    }

    if (count($badList) > 0) {
        return $badList;
    }
    return FALSE;
}
92
93 //--------------------------------------------------------------------
/**
 * Strips out any bad bytes from a UTF-8 string and returns the rest.
 *
 * The string is walked one match at a time with a PCRE pattern: every
 * alternative but the last accepts exactly one well-formed UTF-8 character,
 * and the last alternative (capture group 2) accepts any single byte that
 * none of the others could consume, i.e. an invalid byte. Well-formed
 * characters are copied to the result; invalid bytes are dropped.
 *
 * The pattern comes from the W3 FAQ "Multilingual Forms", modified to accept
 * the full ASCII range including control characters.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str string to clean
 * @return string the input with every bad byte removed
 * @package utf8
 */
function utf8_bad_strip($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                          // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
        '|(.{1}))';                              // invalid byte

    // \G pins every match to $pos, so the subject is scanned in place rather
    // than being re-copied with substr() after each character. The s modifier
    // lets the fallback '.' consume any byte.
    $pattern = '/\G'.$UTF8_BAD.'/sS';
    $len = strlen($str);
    $pos = 0;

    // Accumulate in a local string; this keeps the function free of any
    // interaction with the global output-buffer stack.
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 is only present when the "invalid byte" alternative matched
        if (!isset($matches[2])) {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
126
127 //--------------------------------------------------------------------
/**
 * Replaces every bad byte in a UTF-8 string with an alternative string.
 * An ASCII replacement is recommended so that the result stays well-formed.
 *
 * The string is walked one match at a time with a PCRE pattern: every
 * alternative but the last accepts exactly one well-formed UTF-8 character,
 * and the last alternative (capture group 2) accepts any single byte that
 * none of the others could consume, i.e. an invalid byte. Well-formed
 * characters are copied to the result; each invalid byte is substituted by
 * one copy of $replace.
 *
 * The pattern comes from the W3 FAQ "Multilingual Forms", modified to accept
 * the full ASCII range including control characters.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str string to search
 * @param string $replace string to replace bad bytes with (defaults to '?') - use ASCII
 * @return string the input with every bad byte replaced
 * @package utf8
 */
function utf8_bad_replace($str, $replace = '?') {
    $UTF8_BAD =
        '([\x00-\x7F]'.                          // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.               // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.           // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.            // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // plane 16
        '|(.{1}))';                              // invalid byte

    // \G pins every match to $pos, so the subject is scanned in place rather
    // than being re-copied with substr() after each character. The s modifier
    // lets the fallback '.' consume any byte.
    $pattern = '/\G'.$UTF8_BAD.'/sS';
    $len = strlen($str);
    $pos = 0;

    // Accumulate in a local string; this keeps the function free of any
    // interaction with the global output-buffer stack.
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 is only present when the "invalid byte" alternative matched
        if (!isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
164
165 //--------------------------------------------------------------------
/**
 * Return code from utf8_bad_identify(): the first octet of a five octet
 * sequence was found.
 * Such a sequence is illegal because the codepoint it encodes is either not
 * in shortest form or lies outside the Unicode range 0-0x10FFFF.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_5OCTET', 1);

/**
 * Return code from utf8_bad_identify(): the first octet of a six octet
 * sequence was found.
 * Illegal for the same reason as a five octet sequence: the encoded
 * codepoint is either not in shortest form or outside the Unicode range.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_6OCTET', 2);

/**
 * Return code from utf8_bad_identify(): an octet that is neither US-ASCII
 * nor a legal first octet of a multi-octet sequence was found where a new
 * character was expected.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SEQID', 3);

/**
 * Return code from utf8_bad_identify(): a codepoint was encoded with more
 * octets than necessary. From Unicode 3.1, non-shortest form is illegal.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_NONSHORT', 4);

/**
 * Return code from utf8_bad_identify(): a surrogate codepoint was encoded.
 * From Unicode 3.2, surrogate characters are illegal.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SURROGATE', 5);

/**
 * Return code from utf8_bad_identify(): the decoded codepoint is greater
 * than 0x10FFFF. Codepoints outside the Unicode range are illegal.
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
 * Return code from utf8_bad_identify(): a multi-octet sequence ended before
 * all of its continuation octets were seen.
 * Note: this is kind of a "catch-all"
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SEQINCOMPLETE', 7);
224
225 //--------------------------------------------------------------------
/**
 * Reports on the type of bad byte found in a UTF-8 string. Returns a
 * status code for the first problem found and stores a byte index in $i.
 *
 * The string is decoded octet by octet with a small state machine. On a
 * non-FALSE return, $i is set as follows:
 *  - UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_SEQID: index of the
 *    offending octet itself
 *  - UTF8_BAD_NONSHORT, UTF8_BAD_SURROGATE, UTF8_BAD_UNIOUTRANGE: index of
 *    the last octet of the offending sequence
 *  - UTF8_BAD_SEQINCOMPLETE: index of the octet just before the position
 *    where a continuation octet was expected (the last octet of the string
 *    when the string ends mid-sequence)
 * When no problem is found, $i is set to NULL.
 *
 * @author <hsivonen@iki.fi>
 * @param string $str UTF-8 encoded string
 * @param int $i (by reference) receives the byte index described above; any value passed in is overwritten
 * @return mixed integer constant describing problem or FALSE if valid UTF-8
 * @see utf8_bad_explain
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */
function utf8_bad_identify($str, &$i) {

    $mState = 0;    // cached expected number of octets after the current octet
                    // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;    // cached Unicode character
    $mBytes = 1;    // cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form is rejected by PHP 8
        $in = ord($str[$i]);

        if ($mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {

                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */

                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & ($in))) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its six payload bits into place.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    // initialize UTF8 cache
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence: step back so $i refers to
                // the octet before the one that broke the sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ($mState != 0) {
        // Incomplete multi-octet sequence: the string ended while
        // continuation octets were still expected. $i equals $len here, so
        // step back to the last octet of the string.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}
359
360 //--------------------------------------------------------------------
/**
 * Takes a return code from utf8_bad_identify() and returns a message
 * (in English) explaining what the problem is.
 *
 * The code is compared with the UTF8_BAD_* constants using switch (loose)
 * comparison. For a code that matches none of them an E_USER_WARNING is
 * triggered and FALSE is returned.
 *
 * @param int $code return code from utf8_bad_identify
 * @return mixed string message or FALSE if return code unknown
 * @see utf8_bad_identify
 * @package utf8
 */
function utf8_bad_explain($code) {

    // Every case returns directly, so no break statements are needed.
    switch ($code) {

        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';

        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';

        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';

    }

    // Not one of the known codes
    trigger_error('Unknown error code: '.$code, E_USER_WARNING);
    return FALSE;

}
407