1 <?php
2 // {{{ license
3
4 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
5 //
6 // +----------------------------------------------------------------------+
7 // | This library is free software; you can redistribute it and/or modify |
8 // | it under the terms of the GNU Lesser General Public License as |
9 // | published by the Free Software Foundation; either version 2.1 of the |
10 // | License, or (at your option) any later version. |
11 // | |
12 // | This library is distributed in the hope that it will be useful, but |
13 // | WITHOUT ANY WARRANTY; without even the implied warranty of |
14 // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 // | Lesser General Public License for more details. |
16 // | |
17 // | You should have received a copy of the GNU Lesser General Public |
18 // | License along with this library; if not, write to the Free Software |
19 // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
20 // | USA. |
21 // +----------------------------------------------------------------------+
22 //
23
24 // }}}
25
26 /**
27 * Encode/decode Internationalized Domain Names.
28 *
29 * The class allows to convert internationalized domain names
30 * (see RFC 3490 for details) as they can be used with various registries worldwide
31 * to be translated between their original (localized) form and their encoded form
32 * as it will be used in the DNS (Domain Name System).
33 *
34 * The class provides two public methods, encode() and decode(), which do exactly
35 * what you would expect them to do. You are allowed to use complete domain names,
36 * simple strings and complete email addresses as well. That means, that you might
37 * use any of the following notations:
38 *
39 * - www.nörgler.com
40 * - xn--nrgler-wxa
41 * - xn--brse-5qa.xn--knrz-1ra.info
42 *
43 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
44 * array. Unicode output is available in the same formats.
* You can select your preferred format via {@link set_parameter()}.
46 *
47 * ACE input and output is always expected to be ASCII.
48 *
49 * @author Matthias Sommerfeld <mso@phlylabs.de>
50 * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
51 * @version 0.5.1
52 *
53 */
54 class idna_convert
55 {
/**
 * Holds all relevant mapping tables, loaded from a separate file on construct.
 * See RFC 3454 for details.
 *
 * @var array
 * @access private
 */
var $NP = array();

// Internal settings, do not mess with them
var $_punycode_prefix = 'xn--';   // Prefix that marks a label as Punycode-encoded
var $_invalid_ucs = 0x80000000;   // Not used within this file
var $_max_ucs = 0x10FFFF;         // Upper limit for code points handled by _encode()

// Parameters of the Punycode arithmetic, see _encode(), _decode() and _adapt()
var $_base = 36;
var $_tmin = 1;
var $_tmax = 26;
var $_skew = 38;
var $_damp = 700;
var $_initial_bias = 72;
var $_initial_n = 0x80;

// Constants for Hangul syllable handling, see _hangul_decompose() and _hangul_compose()
var $_sbase = 0xAC00;
var $_lbase = 0x1100;
var $_vbase = 0x1161;
var $_tbase = 0x11A7;
var $_lcount = 19;
var $_vcount = 21;
var $_tcount = 28;
var $_ncount = 588;   // _vcount * _tcount
var $_scount = 11172; // _lcount * _tcount * _vcount

// Last error message as stored by _error(); false as long as nothing went wrong
var $_error = false;

// See {@link set_parameter()} for details of how to change the following
// settings from within your script / application
var $_api_encoding = 'utf8';  // Default input charset is UTF-8
var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
var $_strict_mode = false;    // Behave strict or not

/**
 * _sbase + _lcount * _vcount * _tcount; filled in by the constructor.
 * Declared explicitly so that the constructor does not create a dynamic property.
 *
 * @var integer
 */
var $slast = 0;

/**
 * The constructor: loads the NAMEPREP tables and applies optional settings.
 *
 * Declared as __construct() so that it also runs on PHP versions which no
 * longer treat a method named like the class as the constructor. It must
 * stay in front of idna_convert() to avoid a "redefining constructor" notice
 * on PHP 5.
 *
 * @param    mixed   Array of Parameter => Value pairs for set_parameter(), or false
 * @return   boolean Result of set_parameter() if options were given, else true
 * @access   public
 */
function __construct($options = false)
{
    $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

    // npdata.ser is a data file expected next to this class file.
    // NOTE(review): unserialize() must only ever see trusted data - confirm
    // that nothing user-controlled can replace this file.
    $data_file = dirname(__FILE__).'/npdata.ser';
    if (function_exists('file_get_contents')) {
        $raw = file_get_contents($data_file);
    } else {
        $lines = file($data_file);
        $raw = is_array($lines) ? join('', $lines) : false;
    }
    $tables = ($raw !== false) ? unserialize($raw) : false;
    if (is_array($tables)) {
        $this->NP = $tables;
    } else {
        // Keep $NP an array and leave a trace instead of failing silently
        $this->_error('Could not load the NAMEPREP tables from '.$data_file);
    }

    // If parameters are given, pass these to the respective method
    if (is_array($options)) {
        return $this->set_parameter($options);
    }
    return true;
}

/**
 * PHP 4 style constructor, kept for backward compatibility.
 *
 * @param    mixed   See __construct()
 * @return   boolean See __construct()
 * @access   public
 */
function idna_convert($options = false)
{
    return $this->__construct($options);
}
108
/**
 * Sets one or more option values. Available options and values:
 * [encoding - Format of Unicode strings handed to encode() and returned by
 *         decode(): 'utf8' for UTF-8, 'ucs4_string' for UCS-4 as string,
 *         'ucs4_array' for UCS-4 as array]
 * [overlong - Unicode does not allow unnecessarily long encodings of chars;
 *         set to true to allow them anyway, default is false]
 * [strict - true: strict mode, email addresses and URLs are rejected;
 *         false: loose mode (default)]
 *
 * Processing stops at the first unknown option or value; options handled
 * before that point stay applied.
 *
 * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
 * @param    string    Value to use (if parameter 1 is a string)
 * @return   boolean   true on success, false otherwise
 * @access   public
 */
function set_parameter($option, $value = false)
{
    $settings = is_array($option) ? $option : array($option => $value);

    foreach ($settings as $name => $setting) {
        if ($name == 'encoding') {
            if (!in_array($setting, array('utf8', 'ucs4_string', 'ucs4_array'))) {
                $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                return false;
            }
            $this->_api_encoding = $setting;
        } elseif ($name == 'overlong') {
            $this->_allow_overlong = (bool) $setting;
        } elseif ($name == 'strict') {
            $this->_strict_mode = (bool) $setting;
        } else {
            $this->_error('Set Parameter: Unknown option '.$name);
            return false;
        }
    }
    return true;
}
157
/**
 * Decode a given ACE domain name, email address or URL.
 *
 * Input containing an '@' (not at the first position) is treated as an email
 * address, input containing ':', '.' or '/' as a (possibly complete) domain
 * name or URL, everything else as a single label. Labels which cannot be
 * decoded are passed through unchanged.
 *
 * @param    string   Domain name (ACE string)
 * [@param    string   Desired output encoding, see {@link set_parameter}]
 * @return   mixed    Decoded Domain name (UTF-8 string, UCS-4 string or UCS-4 array),
 *                    false on error
 * @access   public
 */
function decode($input, $one_time_encoding = false)
{
    // A one time encoding must be one of the known formats
    if ($one_time_encoding
            && !in_array($one_time_encoding, array('utf8', 'ucs4_string', 'ucs4_array'))) {
        $this->_error('Unknown encoding '.$one_time_encoding);
        return false;
    }

    // Make sure to drop any newline characters around
    $input = trim($input);

    if (strpos($input, '@')) {
        // Maybe it is an email address - not acceptable in strict mode
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $prefix_re = '!^'.preg_quote($this->_punycode_prefix, '!').'!';
        list($email_pref, $host_part) = explode('@', $input, 2);
        // The domain part is handled before the local part
        $sections = array($host_part, $email_pref);
        foreach ($sections as $idx => $section) {
            $labels = explode('.', $section);
            foreach ($labels as $pos => $label) {
                if (!preg_match($prefix_re, $label)) {
                    continue;
                }
                $plain = $this->_decode($label);
                if ($plain) {
                    $labels[$pos] = $plain;
                }
            }
            $sections[$idx] = join('.', $labels);
        }
        $return = $sections[1].'@'.$sections[0];
    } elseif (preg_match('![:\./]!', $input)) {
        // A complete domain name (with or without paths / parameters) -
        // not acceptable in strict mode either
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $parsed = parse_url($input);
        if (isset($parsed['host'])) {
            $labels = explode('.', $parsed['host']);
            foreach ($labels as $pos => $label) {
                $plain = $this->_decode($label);
                if ($plain) {
                    $labels[$pos] = $plain;
                }
            }
            $parsed['host'] = join('.', $labels);

            // Put the URL together again, piece by piece
            $return = '';
            if (!empty($parsed['scheme'])) {
                $return .= $parsed['scheme'];
                $return .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
            }
            if (!empty($parsed['user'])) {
                $return .= $parsed['user'];
                if (!empty($parsed['pass'])) {
                    $return .= ':'.$parsed['pass'];
                }
                $return .= '@';
            }
            $return .= $parsed['host'];
            if (!empty($parsed['port'])) {
                $return .= ':'.$parsed['port'];
            }
            if (!empty($parsed['path'])) {
                $return .= $parsed['path'];
            }
            if (!empty($parsed['query'])) {
                $return .= '?'.$parsed['query'];
            }
            if (!empty($parsed['fragment'])) {
                $return .= '#'.$parsed['fragment'];
            }
        } else {
            // parse_url did not yield a host, work on the dot separated parts
            $labels = explode('.', $input);
            foreach ($labels as $pos => $label) {
                $plain = $this->_decode($label);
                if ($plain) {
                    $labels[$pos] = $plain;
                }
            }
            $return = join('.', $labels);
        }
    } else {
        // Otherwise we consider it being a pure domain name string
        $return = $this->_decode($input);
        if (!$return) {
            $return = $input;
        }
    }

    // The result is UTF-8 at this point, other output formats need conversion.
    // A one time encoding takes precedence over the object's property.
    $format = ($one_time_encoding) ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        return $return;
    }
    if ($format == 'ucs4_string') {
        return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
    }
    if ($format == 'ucs4_array') {
        return $this->_utf8_to_ucs4($return);
    }
    $this->_error('Unsupported output format');
    return false;
}
259
/**
 * Encode a given domain name, email address or URL.
 *
 * The input is split at the characters '.', '/', ':', '?' and '@' (the dot
 * variants U+3002, U+FF0E and U+FF61 are converted to '.' first). Each part
 * is handed to _encode(); parts which cannot be encoded are passed through
 * as UTF-8. Every delimiter is copied to the output, including one at the
 * very beginning of the input.
 *
 * @param    string   Domain name (UTF-8 or UCS-4)
 * [@param    string   Desired input encoding, see {@link set_parameter}]
 * @return   mixed    Encoded Domain name (ACE string), '' for empty or
 *                    unreadable input, false on error
 * @access   public
 */
function encode($decoded, $one_time_encoding = false)
{
    // Forcing conversion of input to UCS4 array
    // If one time encoding is given, use this, else the objects property
    $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
    switch ($format) {
        case 'utf8':
            $decoded = $this->_utf8_to_ucs4($decoded);
            break;
        case 'ucs4_string':
            $decoded = $this->_ucs4_string_to_ucs4($decoded);
            break;
        case 'ucs4_array':
            break;
        default:
            $this->_error('Unsupported input format: '.$format);
            return false;
    }

    // No input, no output, what else did you expect?
    if (empty($decoded)) return '';

    // Start of the part which has not been written to the output yet
    $last_begin = 0;
    // Output string
    $output = '';
    foreach ($decoded as $k => $v) {
        switch ($v) {
            // Make sure to use just the plain dot
            case 0x3002:
            case 0xFF0E:
            case 0xFF61:
                $decoded[$k] = 0x2E;
                // Right, no break here, the above are converted to dots anyway
            // Stumbling across an anchoring character
            case 0x2E:
            case 0x2F:
            case 0x3A:
            case 0x3F:
            case 0x40:
                // Neither email addresses nor URLs allowed in strict mode
                if ($this->_strict_mode) {
                    $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                    return false;
                }
                // Write the part in front of the delimiter (if any) ...
                if ($k > $last_begin) {
                    $output .= $this->_encode_segment(array_slice($decoded, $last_begin, $k - $last_begin));
                }
                // ... and the delimiter itself, even if it is the first char
                $output .= chr($decoded[$k]);
                $last_begin = $k + 1;
        }
    }
    // Catch the rest of the string (the whole string if there was no delimiter)
    $output .= $this->_encode_segment(array_slice($decoded, $last_begin));
    return $output;
}

/**
 * Encode one delimiter free part of the input; fall back to plain UTF-8 if
 * _encode() refuses it (e.g. because it is pure ASCII or already encoded).
 *
 * @param    array    UCS-4 code points of the part
 * @return   string   ACE string or the part as UTF-8
 * @access   private
 */
function _encode_segment($segment)
{
    if (!$segment) return '';
    $encoded = $this->_encode($segment);
    return ($encoded) ? $encoded : $this->_ucs4_to_utf8($segment);
}
344
/**
 * Use this method to get the last error that occurred.
 *
 * @param    void
 * @return   mixed   The message of the last error; false if none was stored
 * @access   public
 */
function get_last_error()
{
    $last_error = $this->_error;
    return $last_error;
}
355
/**
 * The actual decoding algorithm: turns one Punycode label into UTF-8.
 *
 * Malformed input (missing prefix, non-ASCII literal part, characters other
 * than [0-9a-zA-Z] in the encoded part, premature end of the string) is
 * rejected with an error instead of being decoded to garbage.
 *
 * @param    string   A single label, including the Punycode prefix
 * @return   mixed    The decoded label as UTF-8 string, false on error
 * @access   private
 */
function _decode($encoded)
{
    $prefix_re = '!^'.preg_quote($this->_punycode_prefix, '!').'!';
    // We do need to find the Punycode prefix
    if (!preg_match($prefix_re, $encoded)) {
        $this->_error('This is not a punycode string');
        return false;
    }
    // If nothing left after removing the prefix, it is hopeless
    if ('' === (string) preg_replace($prefix_re, '', $encoded)) {
        $this->_error('The given encoded string was empty');
        return false;
    }

    $prefix_len = strlen($this->_punycode_prefix);
    // Find last occurence of the delimiter; everything between prefix and
    // delimiter is copied literally
    $delim_pos = strrpos($encoded, '-');
    $decoded = array();
    if ($delim_pos > $prefix_len) {
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $code = ord($encoded[$k]);
            if ($code >= $this->_initial_n) {
                $this->_error('The given encoded string contains non-ASCII characters');
                return false;
            }
            $decoded[] = $code;
        }
    }
    $deco_len = count($decoded);
    $enco_len = strlen($encoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;

    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        // Read one variable length integer
        for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
            if ($enco_idx >= $enco_len) {
                // The last digit read announced more digits than there are
                $this->_error('The given encoded string is truncated');
                return false;
            }
            $symbol = $encoded[$enco_idx++];
            // _decode_digit() must only be fed with valid digits
            if (!preg_match('!^[0-9a-zA-Z]$!', $symbol)) {
                $this->_error('The given encoded string contains an invalid character');
                return false;
            }
            $digit = $this->_decode_digit($symbol);
            $idx += $digit * $w;
            $t = ($k <= $bias) ? $this->_tmin :
                    (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
            if ($digit < $t) break;
            $w = (int) ($w * ($this->_base - $t));
        }
        $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        $char += (int) ($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        if ($deco_len > 0) {
            // Make room for the decoded char
            for ($i = $deco_len; $i > $idx; $i--) {
                $decoded[$i] = $decoded[($i - 1)];
            }
        }
        $decoded[$idx++] = $char;
    }
    return $this->_ucs4_to_utf8($decoded);
}
414
/**
 * The actual encoding algorithm: turns one label into its Punycode form.
 *
 * All code points below _initial_n (i.e. all ASCII characters) are copied
 * literally, all others are encoded. Input that cannot be encoded makes the
 * method return false instead of looping endlessly.
 *
 * @param    array    UCS-4 code points of a single label
 * @return   mixed    ACE string; the nameprepped label without prefix if it
 *                    turned out to be pure ASCII; false on error
 * @access   private
 */
function _encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = strlen($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);

    if ($check_pref == $check_deco) {
        $this->_error('This is already a punycode string');
        return false;
    }
    // We will not try to encode strings without any code point above 'z'
    // (note that this threshold lets the ASCII chars { | } ~ and DEL pass)
    $encodable = false;
    foreach ($decoded as $k => $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        $this->_error('The given string does not contain encodable chars');
        return false;
    }

    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

    $deco_len = count($decoded);
    if (!$deco_len) return false; // Empty array

    $codecount = 0; // How many chars have been consumed

    $encoded = '';
    // Copy all basic code points to output. Every ASCII char is a basic code
    // point; none of them may be left over for the loop below, which only
    // ever handles code points >= _initial_n.
    for ($i = 0; $i < $deco_len; ++$i) {
        $test = $decoded[$i];
        if (0 <= $test && $test < $this->_initial_n) {
            $encoded .= chr($test);
            $codecount++;
        }
    }
    if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;

    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) $encoded .= '-';

    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point
        $next_code = null;
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && ($next_code === null || $decoded[$i] < $next_code)) {
                $next_code = $decoded[$i];
            }
        }
        // Without a usable candidate the loop could never finish
        if ($next_code === null || $next_code > $this->_max_ucs) {
            $this->_error('The given string contains code points that cannot be encoded');
            return false;
        }

        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Write $delta as variable length integer
                for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                    $t = ($k <= $bias) ? $this->_tmin :
                            (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                    if ($q < $t) break;
                    $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t))));
                    $q = (int) (($q - $t) / ($this->_base - $t));
                }
                $encoded .= $this->_encode_digit($q);
                $bias = $this->_adapt($delta, $codecount+1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
512
/**
 * Adapt the bias according to the current code point and position
 *
 * @param    integer  Delta that has just been encoded / decoded
 * @param    integer  Number of code points handled so far, including the current one
 * @param    boolean  true for the very first delta of a label
 * @return   integer  The new bias
 * @access   private
 */
function _adapt($delta, $npoints, $is_first)
{
    // The first delta is damped much stronger than all following ones
    $divisor = $is_first ? $this->_damp : 2;
    $delta = (int) ($delta / $divisor);
    $delta += (int) ($delta / $npoints);

    $span = $this->_base - $this->_tmin;
    $threshold = ($span * $this->_tmax) / 2;
    $k = 0;
    while ($delta > $threshold) {
        $delta = (int) ($delta / $span);
        $k += $this->_base;
    }
    return (int) ($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
526
/**
 * Encoding a certain digit: 0..25 become 'a'..'z', 26..35 become '0'..'9'
 *
 * @param    integer  Digit value
 * @return   string   The character representing the digit
 * @access   private
 */
function _encode_digit($d)
{
    // 97 is ord('a'); 22 + 26 is ord('0')
    $offset = ($d < 26) ? 97 : 22;
    return chr($d + $offset);
}
535
/**
 * Decode a certain digit: '0'..'9' yield 26..35, letters of either case
 * yield 0..25.
 *
 * The range checks have no lower bound. Characters outside [0-9a-zA-Z] with
 * a code below ord('z') + 1 therefore do NOT yield _base but some other
 * (possibly negative) number; callers have to make sure to pass valid digits.
 *
 * @param    string   Single character
 * @return   integer  Digit value; _base for characters with higher codes than 'z'
 * @access   private
 */
function _decode_digit($cp)
{
    $code = ord($cp);
    if ($code - 48 < 10) {
        return $code - 22;
    }
    if ($code - 65 < 26) {
        return $code - 65;
    }
    if ($code - 97 < 26) {
        return $code - 97;
    }
    return $this->_base;
}
545
/**
 * Internal error handling method: remembers the given message so that it
 * can be fetched via get_last_error()
 *
 * @param    string   The error message
 * @return   void
 * @access   private
 */
function _error($error = '')
{
    $this->_error = $error;
    return;
}
554
/**
 * Do Nameprep according to RFC3491 and RFC3454
 *
 * Works in three steps: mapping (including the check for prohibited code
 * points), recomposition of Hangul syllables and composition of starter /
 * non-starter sequences via a reverse lookup in the 'replacemaps' table.
 *
 * @param    array    Unicode Characters
 * @return   mixed    Array of Unicode Characters, Nameprep'd; false on prohibited input
 * @access   private
 */
function _nameprep($input)
{
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, $this->NP['map_nothing'])) continue;

        // Try to find prohibited input
        if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
            $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
            return false;
        }
        foreach ($this->NP['prohibit_ranges'] as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
        }
        //
        // Hangul syllable decomposition
        if (0xAC00 <= $v && $v <= 0xD7AF) {
            foreach ($this->_hangul_decompose($v) as $out) {
                $output[] = (int) $out;
            }
        // There's a decomposition mapping for that code point
        } elseif (isset($this->NP['replacemaps'][$v])) {
            foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                $output[] = (int) $out;
            }
        } else {
            $output[] = (int) $v;
        }
    }
    // Before applying any Combining, try to rearrange any Hangul syllables
    $output = $this->_hangul_compose($output);
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->_get_combining_class($output[$i]);
        if ((!$last_class || $last_class > $class) && $class && $i > $last_starter) {
            // Try to match the sequence from the last starter up to AND
            // INCLUDING the current non-starter
            $seq_len = $i + 1 - $last_starter;
            $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                $removed = $seq_len - 1;
                // Close the gap ...
                for ($j = $i + 1; $j < $out_len; ++$j) {
                    $output[$j - $removed] = $output[$j];
                }
                // ... and drop the left over elements at the end
                for ($j = 0; $j < $removed; ++$j) {
                    unset($output[--$out_len]);
                }
                // Go on right behind the composed character, since there
                // can be more possible compositions
                $i = $last_starter;
                $last_class = 0;
                continue;
            }
        }
        // The current class is 0
        if (!$class) $last_starter = $i;
        $last_class = $class;
    }
    return $output;
}
637
/**
 * Decomposes a Hangul syllable
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * @param    integer  32bit UCS4 code point
 * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array;
 *                    all values of a decomposition are integers
 * @access   private
 */
function _hangul_decompose($char)
{
    $sindex = (int) $char - $this->_sbase;
    if ($sindex < 0 || $sindex >= $this->_scount) {
        return array($char);
    }
    // The casts have to cover the divisions; casting the base value only
    // would leave a float behind
    $result = array();
    $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
    $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
    // The trailing consonant is optional; an index of 0 means "none"
    $T = $this->_tbase + $sindex % $this->_tcount;
    if ($T != $this->_tbase) $result[] = $T;
    return $result;
}
/**
 * Composes Hangul syllables
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * A leading consonant followed by a vowel is merged into an LV syllable, an
 * LV syllable followed by a trailing consonant into an LVT syllable.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    UCS4 sequence with syllables composed
 * @access   private
 */
function _hangul_compose($input)
{
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = (int) $input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int) $input[$i];
        $sindex = $last - $this->_sbase;
        $lindex = $last - $this->_lbase;
        $vindex = $char - $this->_vbase;
        $tindex = $char - $this->_tbase;
        // Find out, whether two current characters are LV and T.
        // _tbase itself (index 0) is no trailing consonant, nor is
        // _tbase + _tcount - both must stay untouched.
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                && 0 < $tindex && $tindex < $this->_tcount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
            // create syllable of form LV
            $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
700
/**
 * Looks up the combining class of a certain wide char in the
 * 'norm_combcls' table
 *
 * @param    integer  Wide char to check (32bit integer)
 * @return   integer  Combining class if found, else 0
 * @access   private
 */
function _get_combining_class($char)
{
    if (!isset($this->NP['norm_combcls'][$char])) {
        return 0;
    }
    return $this->NP['norm_combcls'][$char];
}
711
/**
 * Applies the canonical ordering to a decomposed UCS4 sequence: code points
 * with a combining class other than 0 are moved in front of their left
 * neighbours as long as those have a higher combining class
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    Ordered USC4 sequence
 * @access   private
 */
function _apply_cannonical_ordering($input)
{
    $size = count($input);
    // Nothing to sort - and no first element to look at for empty input
    if ($size < 2) return $input;

    do {
        $swap = false;
        $last = $this->_get_combining_class(intval($input[0]));
        for ($i = 0; $i < $size-1; ++$i) {
            $next = $this->_get_combining_class(intval($input[$i+1]));
            if ($next != 0 && $last > $next) {
                // Move item leftward until it fits
                for ($j = $i + 1; $j > 0; --$j) {
                    if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
                    $t = intval($input[$j]);
                    $input[$j] = intval($input[$j-1]);
                    $input[$j-1] = $t;
                    $swap = true;
                }
                // The item now sitting at position $i+1 is the one
                // formerly found at $i, so keep its class for the next round
                $next = $last;
            }
            $last = $next;
        }
    } while ($swap); // Repeat until a complete pass went by without a swap
    return $input;
}
744
/**
 * Do composition of a sequence of starter and non-starter by looking for
 * an entry of the 'replacemaps' table which maps to exactly this sequence
 *
 * @param    array    UCS4 Decomposed sequence
 * @return   mixed    Key of the first matching table entry, false if there is none
 * @access   private
 */
function _combine($input)
{
    $inp_len = count($input);
    foreach ($this->NP['replacemaps'] as $composed => $sequence) {
        // Cheap checks first: same first element, same length
        if ($sequence[0] != $input[0]) {
            continue;
        }
        if (count($sequence) != $inp_len) {
            continue;
        }
        // Now compare element by element
        $matched = false;
        foreach ($input as $pos => $code) {
            $matched = ($code == $sequence[$pos]);
            if (!$matched) {
                break;
            }
        }
        if ($matched) {
            return $composed;
        }
    }
    return false;
}
770
/**
 * This converts an UTF-8 encoded string to its UCS-4 representation
 * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
 * each of the "chars". This is due to PHP not being able to handle strings with
 * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
 * The following UTF-8 encodings are supported:
 * bytes bits  representation
 * 1        7  0xxxxxxx
 * 2       11  110xxxxx 10xxxxxx
 * 3       16  1110xxxx 10xxxxxx 10xxxxxx
 * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Each x represents a bit that can be used to store character data.
 * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
 *
 * Malformed input - including a multi byte sequence which is cut off at the
 * end of the string - makes the method fail.
 *
 * @param    string   UTF-8 string
 * @return   mixed    Array of code points, false on malformed input
 * @access   private
 */
function _utf8_to_ucs4($input)
{
    $output = array();
    $out_len = 0;
    $inp_len = strlen($input);
    $mode = 'next';
    $test = 'none';
    for ($k = 0; $k < $inp_len; ++$k) {
        $v = ord($input[$k]); // Extract byte from input string

        if ($v < 128) { // We found an ASCII char - put into string as is
            if ('add' == $mode) {
                // An ASCII char must not show up within a multi byte sequence
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            $output[$out_len] = $v;
            ++$out_len;
            continue;
        }
        if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
            $start_byte = $v;
            $mode = 'add';
            $test = 'range';
            if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                $v = ($v - 192) << 6;
            } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                $next_byte = 1;
                $v = ($v - 224) << 12;
            } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 2;
                $v = ($v - 240) << 18;
            } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 3;
                $v = ($v - 248) << 24;
            } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 4;
                $v = ($v - 252) << 30;
            } else {
                $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                return false;
            }
            // Store the bits of the start byte; the following bytes add theirs
            $output[$out_len] = (int) $v;
            ++$out_len;
            continue;
        }
        // From here on we are within a multi byte sequence ('add' mode).
        // The legal range can be checked with the first byte following the start byte.
        if (!$this->_allow_overlong && $test == 'range') {
            $test = 'none';
            if ($start_byte == 0xC0 || $start_byte == 0xC1 // always overlong
                    || ($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0)
                    || ($v > 0x8F && $start_byte == 0xF4)) {
                $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                return false;
            }
        }
        if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
            $v = ($v - 128) << ($next_byte * 6);
            $output[($out_len - 1)] += $v;
            --$next_byte;
        } else {
            $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
            return false;
        }
        if ($next_byte < 0) {
            $mode = 'next';
        }
    } // for
    if ('add' == $mode) {
        // The input ended in the middle of a multi byte sequence
        $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
        return false;
    }
    return $output;
}
859
/**
 * Convert UCS-4 array into UTF-8 string
 * See _utf8_to_ucs4() for details
 *
 * @param    array    Code points
 * @return   mixed    UTF-8 string; false if a value needs more than 31 bits
 * @access   private
 */
function _ucs4_to_utf8($input)
{
    $output = '';
    $k = 0;
    foreach ($input as $v) {
        ++$k;
        if ($v < 128) { // 7bit are transferred literally
            $output .= chr($v);
            continue;
        }
        // Find out the marker of the start byte and the number of bytes following it
        if ($v < (1 << 11)) {
            $lead = 192;
            $tail = 1;
        } elseif ($v < (1 << 16)) {
            $lead = 224;
            $tail = 2;
        } elseif ($v < (1 << 21)) {
            $lead = 240;
            $tail = 3;
        } elseif ($v < (1 << 26)) {
            $lead = 248;
            $tail = 4;
        } elseif ($v < (1 << 31)) {
            $lead = 252;
            $tail = 5;
        } else {
            $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
            return false;
        }
        // The start byte takes the topmost bits, each following byte 6 more bits
        $output .= chr($lead + ($v >> (6 * $tail)));
        while ($tail-- > 0) {
            $output .= chr(128 + (($v >> (6 * $tail)) & 63));
        }
    }
    return $output;
}
896
/**
 * Convert UCS-4 array into UCS-4 string: every value becomes 4 bytes,
 * the most significant byte first
 *
 * @param    array    Code points
 * @return   string   UCS-4 string
 * @access   private
 */
function _ucs4_to_ucs4_string($input)
{
    $output = '';
    foreach ($input as $v) {
        // Walk from the topmost byte down to the lowest one
        for ($shift = 24; $shift >= 0; $shift -= 8) {
            $output .= chr(($v >> $shift) & 255);
        }
    }
    return $output;
}
912
/**
 * Convert UCS-4 string into UCS-4 array: every 4 bytes (most significant
 * byte first) become one value
 *
 * @param    string   UCS-4 string
 * @return   mixed    Array of code points; false if the length of the input
 *                    is not a multiple of 4
 * @access   private
 */
function _ucs4_string_to_ucs4($input)
{
    $output = array();
    $inp_len = strlen($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    // Empty input yields empty output, since the loop is never entered
    for ($i = 0; $i < $inp_len; $i += 4) {
        $output[] = (ord($input[$i]) << 24)
                + (ord($input[$i + 1]) << 16)
                + (ord($input[$i + 2]) << 8)
                + ord($input[$i + 3]);
    }
    return $output;
}
939 }
940
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value; simply hands everything over to
     * {@link idna_convert::set_parameter()}, which this class inherits.
     * Available options and values:
     * [encoding - 'utf8' for UTF-8, 'ucs4_string' and 'ucs4_array'
     *         respectively for UCS4]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *         to allow this, set this parameter to true, else to false;
     *         default is false.]
     * [strict - true: strict mode; false: loose mode (default)]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // There is neither an $IC property nor a set_parameters() method;
        // the inherited method has to be called on the object itself
        return $this->set_parameter($option, $param);
    }
}
968
969 ?>