1 <?php
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
44
45
46 47 48 49 50 51 52 53
/**
 * Decodes HTML character references ("&amp;", "&#169;", "&#xA9;", ...) found
 * in a string into their UTF-8 byte sequences.
 *
 * Usage: construct with the text, then call parse() once to obtain the
 * decoded text. The object works as a small cursor-based scanner: `position`
 * walks over `data`, and `consumed` records the characters read for the
 * reference currently being examined.
 */
class SimplePie_Decode_HTML_Entities
{
    /**
     * The text being decoded; references are replaced in place as parse() runs.
     *
     * @var string
     */
    public $data = '';

    /**
     * Characters consumed so far for the reference currently being examined.
     * When driven by parse(), this always starts with the '&' itself.
     *
     * @var string
     */
    public $consumed = '';

    /**
     * Byte offset in $data of the next character to be consumed.
     *
     * @var int
     */
    public $position = 0;

    /**
     * Store the text that parse() will decode.
     *
     * @param string $data Text that may contain character references
     */
    public function __construct($data)
    {
        $this->data = $data;
    }

    /**
     * Decode every character reference in the text.
     *
     * Scans for each '&', hands it to entity(), and resumes scanning right
     * after whatever entity() left behind, so a replacement is never decoded
     * a second time ("&amp;lt;" yields "&lt;", not "<").
     *
     * The strpos() result is kept in a local so that $position always stays
     * an integer offset; calling parse() again therefore does not rescan and
     * double-decode text that has already been processed.
     *
     * @return string The decoded text
     */
    public function parse()
    {
        while (($ampersand = strpos($this->data, '&', $this->position)) !== false)
        {
            $this->position = $ampersand;
            $this->consume();
            $this->entity();
            $this->consumed = '';
        }

        return $this->data;
    }

    /**
     * Consume the next byte of the text.
     *
     * @return string|false The consumed byte, or false at end of input
     *                      (in which case nothing is recorded or advanced)
     */
    public function consume()
    {
        if (isset($this->data[$this->position]))
        {
            $this->consumed .= $this->data[$this->position];
            return $this->data[$this->position++];
        }

        return false;
    }

    /**
     * Consume the longest run of bytes, starting at the current position,
     * that consists only of the given characters.
     *
     * @param string $chars Set of acceptable characters
     * @return string|false The consumed run, or false when the run is empty
     */
    public function consume_range($chars)
    {
        $length = strspn($this->data, $chars, $this->position);

        if ($length === 0)
        {
            return false;
        }

        $run = substr($this->data, $this->position, $length);
        $this->consumed .= $run;
        $this->position += $length;

        return $run;
    }

    /**
     * Give back the most recently consumed byte.
     *
     * Must only be called after a consume() that actually returned a byte.
     */
    public function unconsume()
    {
        $this->consumed = substr($this->consumed, 0, -1);
        $this->position--;
    }

    /**
     * Decode the reference whose '&' has just been consumed, if it is one.
     *
     * On return, $position points just past the replacement when a reference
     * was decoded. When the '&' did not start a named reference, or was
     * followed by whitespace, '<' or another '&', $position points directly
     * after that '&' so that nothing following it is skipped by the scan in
     * parse().
     */
    public function entity()
    {
        $char = $this->consume();

        // End of input directly after '&': nothing to decode.
        if ($char === false)
        {
            return;
        }

        if ($char === "\x23")
        {
            $this->numeric_reference();
        }
        elseif (in_array($char, array("\x09", "\x0A", "\x0B", "\x0C", "\x20", "\x3C", "\x26"), true))
        {
            // Whitespace, '<' or '&' cannot start a reference. Hand the byte
            // back: if it is itself an '&' it must be scanned as the possible
            // start of the next reference ("&&amp;" must become "&&").
            $this->unconsume();
        }
        else
        {
            $this->named_reference();
        }
    }

    /**
     * Decode a numeric reference; "&#" has already been consumed.
     *
     * Accepts decimal ("&#169;") and hexadecimal ("&#xA9;" / "&#XA9;") forms,
     * with or without the terminating ';'. Leaves the text untouched when no
     * digits follow.
     */
    private function numeric_reference()
    {
        // Code points that are replaced by a fixed byte sequence instead of
        // being encoded directly: CR becomes LF, and 0x80-0x9F are read as
        // Windows-1252 bytes (undefined slots become U+FFFD).
        static $windows_1252_specials = array(
            0x0D => "\x0A",
            0x80 => "\xE2\x82\xAC",
            0x81 => "\xEF\xBF\xBD",
            0x82 => "\xE2\x80\x9A",
            0x83 => "\xC6\x92",
            0x84 => "\xE2\x80\x9E",
            0x85 => "\xE2\x80\xA6",
            0x86 => "\xE2\x80\xA0",
            0x87 => "\xE2\x80\xA1",
            0x88 => "\xCB\x86",
            0x89 => "\xE2\x80\xB0",
            0x8A => "\xC5\xA0",
            0x8B => "\xE2\x80\xB9",
            0x8C => "\xC5\x92",
            0x8D => "\xEF\xBF\xBD",
            0x8E => "\xC5\xBD",
            0x8F => "\xEF\xBF\xBD",
            0x90 => "\xEF\xBF\xBD",
            0x91 => "\xE2\x80\x98",
            0x92 => "\xE2\x80\x99",
            0x93 => "\xE2\x80\x9C",
            0x94 => "\xE2\x80\x9D",
            0x95 => "\xE2\x80\xA2",
            0x96 => "\xE2\x80\x93",
            0x97 => "\xE2\x80\x94",
            0x98 => "\xCB\x9C",
            0x99 => "\xE2\x84\xA2",
            0x9A => "\xC5\xA1",
            0x9B => "\xE2\x80\xBA",
            0x9C => "\xC5\x93",
            0x9D => "\xEF\xBF\xBD",
            0x9E => "\xC5\xBE",
            0x9F => "\xC5\xB8",
        );

        $marker = $this->consume();

        if ($marker === "\x78" || $marker === "\x58")
        {
            $range = '0123456789ABCDEFabcdef';
            $hex = true;
        }
        else
        {
            $range = '0123456789';
            $hex = false;

            // Hand the byte back so it can be read as the first digit. At end
            // of input nothing was consumed, so there is nothing to hand back.
            if ($marker !== false)
            {
                $this->unconsume();
            }
        }

        // Compare against false explicitly: the digit string "0" is falsy.
        $digits = $this->consume_range($range);
        if ($digits === false)
        {
            return;
        }

        $codepoint = $hex ? hexdec($digits) : intval($digits);

        if (
            !is_int($codepoint)
            || $codepoint === 0
            || $codepoint > 0x10FFFF
            || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)
        )
        {
            // hexdec() returns a float once the value no longer fits in an
            // integer; NUL, surrogates and values above U+10FFFF are not
            // encodable characters either. All become U+FFFD.
            $replacement = "\xEF\xBF\xBD";
        }
        elseif (isset($windows_1252_specials[$codepoint]))
        {
            $replacement = $windows_1252_specials[$codepoint];
        }
        else
        {
            $replacement = SimplePie_Misc::codepoint_to_utf8($codepoint);
        }

        // The terminating ';' is optional; anything else belongs to the text
        // that follows and is handed back.
        $terminator = $this->consume();
        if ($terminator !== ';' && $terminator !== false)
        {
            $this->unconsume();
        }

        $consumed_length = strlen($this->consumed);
        $this->data = substr_replace($this->data, $replacement, $this->position - $consumed_length, $consumed_length);
        $this->position += strlen($replacement) - $consumed_length;
    }

    /**
     * Decode a named reference; '&' and the first byte of the candidate name
     * have already been consumed.
     *
     * Reads up to nine further bytes and uses the longest prefix that is a
     * known name (so "&notin;" wins over "&not"). Names listed without a
     * trailing ';' in the table are accepted unterminated.
     */
    private function named_reference()
    {
        // Reference name (with or without ';' as permitted) => UTF-8 bytes.
        static $entities = array(
            'Aacute' => "\xC3\x81",
            'aacute' => "\xC3\xA1",
            'Aacute;' => "\xC3\x81",
            'aacute;' => "\xC3\xA1",
            'Acirc' => "\xC3\x82",
            'acirc' => "\xC3\xA2",
            'Acirc;' => "\xC3\x82",
            'acirc;' => "\xC3\xA2",
            'acute' => "\xC2\xB4",
            'acute;' => "\xC2\xB4",
            'AElig' => "\xC3\x86",
            'aelig' => "\xC3\xA6",
            'AElig;' => "\xC3\x86",
            'aelig;' => "\xC3\xA6",
            'Agrave' => "\xC3\x80",
            'agrave' => "\xC3\xA0",
            'Agrave;' => "\xC3\x80",
            'agrave;' => "\xC3\xA0",
            'alefsym;' => "\xE2\x84\xB5",
            'Alpha;' => "\xCE\x91",
            'alpha;' => "\xCE\xB1",
            'AMP' => "\x26",
            'amp' => "\x26",
            'AMP;' => "\x26",
            'amp;' => "\x26",
            'and;' => "\xE2\x88\xA7",
            'ang;' => "\xE2\x88\xA0",
            'apos;' => "\x27",
            'Aring' => "\xC3\x85",
            'aring' => "\xC3\xA5",
            'Aring;' => "\xC3\x85",
            'aring;' => "\xC3\xA5",
            'asymp;' => "\xE2\x89\x88",
            'Atilde' => "\xC3\x83",
            'atilde' => "\xC3\xA3",
            'Atilde;' => "\xC3\x83",
            'atilde;' => "\xC3\xA3",
            'Auml' => "\xC3\x84",
            'auml' => "\xC3\xA4",
            'Auml;' => "\xC3\x84",
            'auml;' => "\xC3\xA4",
            'bdquo;' => "\xE2\x80\x9E",
            'Beta;' => "\xCE\x92",
            'beta;' => "\xCE\xB2",
            'brvbar' => "\xC2\xA6",
            'brvbar;' => "\xC2\xA6",
            'bull;' => "\xE2\x80\xA2",
            'cap;' => "\xE2\x88\xA9",
            'Ccedil' => "\xC3\x87",
            'ccedil' => "\xC3\xA7",
            'Ccedil;' => "\xC3\x87",
            'ccedil;' => "\xC3\xA7",
            'cedil' => "\xC2\xB8",
            'cedil;' => "\xC2\xB8",
            'cent' => "\xC2\xA2",
            'cent;' => "\xC2\xA2",
            'Chi;' => "\xCE\xA7",
            'chi;' => "\xCF\x87",
            'circ;' => "\xCB\x86",
            'clubs;' => "\xE2\x99\xA3",
            'cong;' => "\xE2\x89\x85",
            'COPY' => "\xC2\xA9",
            'copy' => "\xC2\xA9",
            'COPY;' => "\xC2\xA9",
            'copy;' => "\xC2\xA9",
            'crarr;' => "\xE2\x86\xB5",
            'cup;' => "\xE2\x88\xAA",
            'curren' => "\xC2\xA4",
            'curren;' => "\xC2\xA4",
            'Dagger;' => "\xE2\x80\xA1",
            'dagger;' => "\xE2\x80\xA0",
            'dArr;' => "\xE2\x87\x93",
            'darr;' => "\xE2\x86\x93",
            'deg' => "\xC2\xB0",
            'deg;' => "\xC2\xB0",
            'Delta;' => "\xCE\x94",
            'delta;' => "\xCE\xB4",
            'diams;' => "\xE2\x99\xA6",
            'divide' => "\xC3\xB7",
            'divide;' => "\xC3\xB7",
            'Eacute' => "\xC3\x89",
            'eacute' => "\xC3\xA9",
            'Eacute;' => "\xC3\x89",
            'eacute;' => "\xC3\xA9",
            'Ecirc' => "\xC3\x8A",
            'ecirc' => "\xC3\xAA",
            'Ecirc;' => "\xC3\x8A",
            'ecirc;' => "\xC3\xAA",
            'Egrave' => "\xC3\x88",
            'egrave' => "\xC3\xA8",
            'Egrave;' => "\xC3\x88",
            'egrave;' => "\xC3\xA8",
            'empty;' => "\xE2\x88\x85",
            'emsp;' => "\xE2\x80\x83",
            'ensp;' => "\xE2\x80\x82",
            'Epsilon;' => "\xCE\x95",
            'epsilon;' => "\xCE\xB5",
            'equiv;' => "\xE2\x89\xA1",
            'Eta;' => "\xCE\x97",
            'eta;' => "\xCE\xB7",
            'ETH' => "\xC3\x90",
            'eth' => "\xC3\xB0",
            'ETH;' => "\xC3\x90",
            'eth;' => "\xC3\xB0",
            'Euml' => "\xC3\x8B",
            'euml' => "\xC3\xAB",
            'Euml;' => "\xC3\x8B",
            'euml;' => "\xC3\xAB",
            'euro;' => "\xE2\x82\xAC",
            'exist;' => "\xE2\x88\x83",
            'fnof;' => "\xC6\x92",
            'forall;' => "\xE2\x88\x80",
            'frac12' => "\xC2\xBD",
            'frac12;' => "\xC2\xBD",
            'frac14' => "\xC2\xBC",
            'frac14;' => "\xC2\xBC",
            'frac34' => "\xC2\xBE",
            'frac34;' => "\xC2\xBE",
            'frasl;' => "\xE2\x81\x84",
            'Gamma;' => "\xCE\x93",
            'gamma;' => "\xCE\xB3",
            'ge;' => "\xE2\x89\xA5",
            'GT' => "\x3E",
            'gt' => "\x3E",
            'GT;' => "\x3E",
            'gt;' => "\x3E",
            'hArr;' => "\xE2\x87\x94",
            'harr;' => "\xE2\x86\x94",
            'hearts;' => "\xE2\x99\xA5",
            'hellip;' => "\xE2\x80\xA6",
            'Iacute' => "\xC3\x8D",
            'iacute' => "\xC3\xAD",
            'Iacute;' => "\xC3\x8D",
            'iacute;' => "\xC3\xAD",
            'Icirc' => "\xC3\x8E",
            'icirc' => "\xC3\xAE",
            'Icirc;' => "\xC3\x8E",
            'icirc;' => "\xC3\xAE",
            'iexcl' => "\xC2\xA1",
            'iexcl;' => "\xC2\xA1",
            'Igrave' => "\xC3\x8C",
            'igrave' => "\xC3\xAC",
            'Igrave;' => "\xC3\x8C",
            'igrave;' => "\xC3\xAC",
            'image;' => "\xE2\x84\x91",
            'infin;' => "\xE2\x88\x9E",
            'int;' => "\xE2\x88\xAB",
            'Iota;' => "\xCE\x99",
            'iota;' => "\xCE\xB9",
            'iquest' => "\xC2\xBF",
            'iquest;' => "\xC2\xBF",
            'isin;' => "\xE2\x88\x88",
            'Iuml' => "\xC3\x8F",
            'iuml' => "\xC3\xAF",
            'Iuml;' => "\xC3\x8F",
            'iuml;' => "\xC3\xAF",
            'Kappa;' => "\xCE\x9A",
            'kappa;' => "\xCE\xBA",
            'Lambda;' => "\xCE\x9B",
            'lambda;' => "\xCE\xBB",
            'lang;' => "\xE3\x80\x88",
            'laquo' => "\xC2\xAB",
            'laquo;' => "\xC2\xAB",
            'lArr;' => "\xE2\x87\x90",
            'larr;' => "\xE2\x86\x90",
            'lceil;' => "\xE2\x8C\x88",
            'ldquo;' => "\xE2\x80\x9C",
            'le;' => "\xE2\x89\xA4",
            'lfloor;' => "\xE2\x8C\x8A",
            'lowast;' => "\xE2\x88\x97",
            'loz;' => "\xE2\x97\x8A",
            'lrm;' => "\xE2\x80\x8E",
            'lsaquo;' => "\xE2\x80\xB9",
            'lsquo;' => "\xE2\x80\x98",
            'LT' => "\x3C",
            'lt' => "\x3C",
            'LT;' => "\x3C",
            'lt;' => "\x3C",
            'macr' => "\xC2\xAF",
            'macr;' => "\xC2\xAF",
            'mdash;' => "\xE2\x80\x94",
            'micro' => "\xC2\xB5",
            'micro;' => "\xC2\xB5",
            'middot' => "\xC2\xB7",
            'middot;' => "\xC2\xB7",
            'minus;' => "\xE2\x88\x92",
            'Mu;' => "\xCE\x9C",
            'mu;' => "\xCE\xBC",
            'nabla;' => "\xE2\x88\x87",
            'nbsp' => "\xC2\xA0",
            'nbsp;' => "\xC2\xA0",
            'ndash;' => "\xE2\x80\x93",
            'ne;' => "\xE2\x89\xA0",
            'ni;' => "\xE2\x88\x8B",
            'not' => "\xC2\xAC",
            'not;' => "\xC2\xAC",
            'notin;' => "\xE2\x88\x89",
            'nsub;' => "\xE2\x8A\x84",
            'Ntilde' => "\xC3\x91",
            'ntilde' => "\xC3\xB1",
            'Ntilde;' => "\xC3\x91",
            'ntilde;' => "\xC3\xB1",
            'Nu;' => "\xCE\x9D",
            'nu;' => "\xCE\xBD",
            'Oacute' => "\xC3\x93",
            'oacute' => "\xC3\xB3",
            'Oacute;' => "\xC3\x93",
            'oacute;' => "\xC3\xB3",
            'Ocirc' => "\xC3\x94",
            'ocirc' => "\xC3\xB4",
            'Ocirc;' => "\xC3\x94",
            'ocirc;' => "\xC3\xB4",
            'OElig;' => "\xC5\x92",
            'oelig;' => "\xC5\x93",
            'Ograve' => "\xC3\x92",
            'ograve' => "\xC3\xB2",
            'Ograve;' => "\xC3\x92",
            'ograve;' => "\xC3\xB2",
            'oline;' => "\xE2\x80\xBE",
            'Omega;' => "\xCE\xA9",
            'omega;' => "\xCF\x89",
            'Omicron;' => "\xCE\x9F",
            'omicron;' => "\xCE\xBF",
            'oplus;' => "\xE2\x8A\x95",
            'or;' => "\xE2\x88\xA8",
            'ordf' => "\xC2\xAA",
            'ordf;' => "\xC2\xAA",
            'ordm' => "\xC2\xBA",
            'ordm;' => "\xC2\xBA",
            'Oslash' => "\xC3\x98",
            'oslash' => "\xC3\xB8",
            'Oslash;' => "\xC3\x98",
            'oslash;' => "\xC3\xB8",
            'Otilde' => "\xC3\x95",
            'otilde' => "\xC3\xB5",
            'Otilde;' => "\xC3\x95",
            'otilde;' => "\xC3\xB5",
            'otimes;' => "\xE2\x8A\x97",
            'Ouml' => "\xC3\x96",
            'ouml' => "\xC3\xB6",
            'Ouml;' => "\xC3\x96",
            'ouml;' => "\xC3\xB6",
            'para' => "\xC2\xB6",
            'para;' => "\xC2\xB6",
            'part;' => "\xE2\x88\x82",
            'permil;' => "\xE2\x80\xB0",
            'perp;' => "\xE2\x8A\xA5",
            'Phi;' => "\xCE\xA6",
            'phi;' => "\xCF\x86",
            'Pi;' => "\xCE\xA0",
            'pi;' => "\xCF\x80",
            'piv;' => "\xCF\x96",
            'plusmn' => "\xC2\xB1",
            'plusmn;' => "\xC2\xB1",
            'pound' => "\xC2\xA3",
            'pound;' => "\xC2\xA3",
            'Prime;' => "\xE2\x80\xB3",
            'prime;' => "\xE2\x80\xB2",
            'prod;' => "\xE2\x88\x8F",
            'prop;' => "\xE2\x88\x9D",
            'Psi;' => "\xCE\xA8",
            'psi;' => "\xCF\x88",
            'QUOT' => "\x22",
            'quot' => "\x22",
            'QUOT;' => "\x22",
            'quot;' => "\x22",
            'radic;' => "\xE2\x88\x9A",
            'rang;' => "\xE3\x80\x89",
            'raquo' => "\xC2\xBB",
            'raquo;' => "\xC2\xBB",
            'rArr;' => "\xE2\x87\x92",
            'rarr;' => "\xE2\x86\x92",
            'rceil;' => "\xE2\x8C\x89",
            'rdquo;' => "\xE2\x80\x9D",
            'real;' => "\xE2\x84\x9C",
            'REG' => "\xC2\xAE",
            'reg' => "\xC2\xAE",
            'REG;' => "\xC2\xAE",
            'reg;' => "\xC2\xAE",
            'rfloor;' => "\xE2\x8C\x8B",
            'Rho;' => "\xCE\xA1",
            'rho;' => "\xCF\x81",
            'rlm;' => "\xE2\x80\x8F",
            'rsaquo;' => "\xE2\x80\xBA",
            'rsquo;' => "\xE2\x80\x99",
            'sbquo;' => "\xE2\x80\x9A",
            'Scaron;' => "\xC5\xA0",
            'scaron;' => "\xC5\xA1",
            'sdot;' => "\xE2\x8B\x85",
            'sect' => "\xC2\xA7",
            'sect;' => "\xC2\xA7",
            'shy' => "\xC2\xAD",
            'shy;' => "\xC2\xAD",
            'Sigma;' => "\xCE\xA3",
            'sigma;' => "\xCF\x83",
            'sigmaf;' => "\xCF\x82",
            'sim;' => "\xE2\x88\xBC",
            'spades;' => "\xE2\x99\xA0",
            'sub;' => "\xE2\x8A\x82",
            'sube;' => "\xE2\x8A\x86",
            'sum;' => "\xE2\x88\x91",
            'sup;' => "\xE2\x8A\x83",
            'sup1' => "\xC2\xB9",
            'sup1;' => "\xC2\xB9",
            'sup2' => "\xC2\xB2",
            'sup2;' => "\xC2\xB2",
            'sup3' => "\xC2\xB3",
            'sup3;' => "\xC2\xB3",
            'supe;' => "\xE2\x8A\x87",
            'szlig' => "\xC3\x9F",
            'szlig;' => "\xC3\x9F",
            'Tau;' => "\xCE\xA4",
            'tau;' => "\xCF\x84",
            'there4;' => "\xE2\x88\xB4",
            'Theta;' => "\xCE\x98",
            'theta;' => "\xCE\xB8",
            'thetasym;' => "\xCF\x91",
            'thinsp;' => "\xE2\x80\x89",
            'THORN' => "\xC3\x9E",
            'thorn' => "\xC3\xBE",
            'THORN;' => "\xC3\x9E",
            'thorn;' => "\xC3\xBE",
            'tilde;' => "\xCB\x9C",
            'times' => "\xC3\x97",
            'times;' => "\xC3\x97",
            'TRADE;' => "\xE2\x84\xA2",
            'trade;' => "\xE2\x84\xA2",
            'Uacute' => "\xC3\x9A",
            'uacute' => "\xC3\xBA",
            'Uacute;' => "\xC3\x9A",
            'uacute;' => "\xC3\xBA",
            'uArr;' => "\xE2\x87\x91",
            'uarr;' => "\xE2\x86\x91",
            'Ucirc' => "\xC3\x9B",
            'ucirc' => "\xC3\xBB",
            'Ucirc;' => "\xC3\x9B",
            'ucirc;' => "\xC3\xBB",
            'Ugrave' => "\xC3\x99",
            'ugrave' => "\xC3\xB9",
            'Ugrave;' => "\xC3\x99",
            'ugrave;' => "\xC3\xB9",
            'uml' => "\xC2\xA8",
            'uml;' => "\xC2\xA8",
            'upsih;' => "\xCF\x92",
            'Upsilon;' => "\xCE\xA5",
            'upsilon;' => "\xCF\x85",
            'Uuml' => "\xC3\x9C",
            'uuml' => "\xC3\xBC",
            'Uuml;' => "\xC3\x9C",
            'uuml;' => "\xC3\xBC",
            'weierp;' => "\xE2\x84\x98",
            'Xi;' => "\xCE\x9E",
            'xi;' => "\xCE\xBE",
            'Yacute' => "\xC3\x9D",
            'yacute' => "\xC3\xBD",
            'Yacute;' => "\xC3\x9D",
            'yacute;' => "\xC3\xBD",
            'yen' => "\xC2\xA5",
            'yen;' => "\xC2\xA5",
            'yuml' => "\xC3\xBF",
            'Yuml;' => "\xC5\xB8",
            'yuml;' => "\xC3\xBF",
            'Zeta;' => "\xCE\x96",
            'zeta;' => "\xCE\xB6",
            'zwj;' => "\xE2\x80\x8D",
            'zwnj;' => "\xE2\x80\x8C",
        );

        // Offset of the '&' that opened this candidate ($consumed is "&" plus
        // the first byte of the name at this point).
        $start = $this->position - strlen($this->consumed);

        // The longest name in the table is nine bytes, and one byte is
        // already consumed, so nine further bytes is enough look-ahead.
        $match = null;
        for ($i = 0; $i < 9 && $this->consume() !== false; $i++)
        {
            $candidate = substr($this->consumed, 1);
            if (isset($entities[$candidate]))
            {
                $match = $candidate;
            }
        }

        if ($match === null)
        {
            // Not a reference. Rewind to just after the '&' so that any '&'
            // inside the look-ahead window is still scanned by parse()
            // ("AT&T &amp; co" must still decode the "&amp;").
            $this->position = $start + 1;
            $this->consumed = substr($this->consumed, 0, 1);
            return;
        }

        // Replace '&' plus the matched name only; bytes read beyond the match
        // stay in the text and are rescanned from the new position.
        $this->data = substr_replace($this->data, $entities[$match], $start, strlen($match) + 1);
        $this->position = $start + strlen($entities[$match]);
    }
}
617
618