1 <?php
2 /**
3 * @package Joomla.Platform
4 * @subpackage Filter
5 *
6 * @copyright Copyright (C) 2005 - 2017 Open Source Matters, Inc. All rights reserved.
7 * @license GNU General Public License version 2 or later; see LICENSE
8 */
9
10 defined('JPATH_PLATFORM') or die;
11
12 use Joomla\Filter\InputFilter;
13 use Joomla\String\StringHelper;
14
15 /**
16 * JFilterInput is a class for filtering input from any data source
17 *
18 * Forked from the php input filter library by: Daniel Morris <dan@rootcube.com>
19 * Original Contributors: Gianpaolo Racca, Ghislain Picard, Marco Wandschneider, Chris Tobin and Andrew Eddie.
20 *
21 * @since 11.1
22 */
23 class JFilterInput extends InputFilter
24 {
/**
 * A flag for Unicode Supplementary Characters (4-byte Unicode character) stripping.
 *
 * The constructor documents the accepted values as 1 = strip, 0 = do not strip and
 * -1 = ask the database driver; it resolves -1 to either 0 or 1. clean() strips the
 * characters whenever this value is truthy.
 *
 * @var    integer
 *
 * @since  3.5
 */
public $stripUSC = 0;
33
/**
 * Builds a filter from the given tag/attribute lists and filtering modes.
 *
 * Tag and attribute names are normalised to lowercase before being stored. When $stripUSC is
 * exactly -1 the database driver is asked whether it supports utf8mb4: stripping is switched
 * off when it does and switched on when it does not, or when the driver raises a RuntimeException.
 *
 * @param   array    $tagsArray   List of user-defined tags
 * @param   array    $attrArray   List of user-defined attributes
 * @param   integer  $tagsMethod  WhiteList method = 0, BlackList method = 1
 * @param   integer  $attrMethod  WhiteList method = 0, BlackList method = 1
 * @param   integer  $xssAuto     Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1
 * @param   integer  $stripUSC    Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1
 *
 * @since   11.1
 */
public function __construct($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1)
{
	// User supplied names are matched case-insensitively, so store them lowercased
	$this->tagsArray  = array_map('strtolower', (array) $tagsArray);
	$this->attrArray  = array_map('strtolower', (array) $attrArray);
	$this->tagsMethod = $tagsMethod;
	$this->attrMethod = $attrMethod;
	$this->xssAuto    = $xssAuto;
	$this->stripUSC   = $stripUSC;

	// An explicit choice was made by the caller, nothing left to resolve
	if ($stripUSC !== -1)
	{
		return;
	}

	try
	{
		$driver = JFactory::getDbo();

		// The driver can only report its utf8mb4 capability once it is connected
		$driver->connect();

		$this->stripUSC = $driver->hasUTF8mb4Support() ? 0 : 1;
	}
	catch (RuntimeException $e)
	{
		// No usable connection: strip USCs to be on the safe side
		$this->stripUSC = 1;
	}
}
83
/**
 * Returns an input filter object, only creating it if it doesn't already exist.
 *
 * Instances are cached per combination of all six arguments. $stripUSC is part of the cache key so
 * that a request for a different stripping mode is never answered with an instance configured for
 * another one.
 *
 * @param   array    $tagsArray   List of user-defined tags
 * @param   array    $attrArray   List of user-defined attributes
 * @param   integer  $tagsMethod  WhiteList method = 0, BlackList method = 1
 * @param   integer  $attrMethod  WhiteList method = 0, BlackList method = 1
 * @param   integer  $xssAuto     Only auto clean essentials = 0, Allow clean blacklisted tags/attr = 1
 * @param   integer  $stripUSC    Strip 4-byte unicode characters = 1, no strip = 0, ask the database driver = -1
 *
 * @return  JFilterInput  The JFilterInput object.
 *
 * @since   11.1
 */
public static function &getInstance($tagsArray = array(), $attrArray = array(), $tagsMethod = 0, $attrMethod = 0, $xssAuto = 1, $stripUSC = -1)
{
	// Every argument that influences the resulting filter has to take part in the signature
	$sig = md5(serialize(array($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto, $stripUSC)));

	if (empty(self::$instances[$sig]))
	{
		self::$instances[$sig] = new JFilterInput($tagsArray, $attrArray, $tagsMethod, $attrMethod, $xssAuto, $stripUSC);
	}

	// The method returns by reference, so hand back the cached slot itself
	return self::$instances[$sig];
}
109
/**
 * Method to be called by another php script. Processes for XSS and
 * specified bad code.
 *
 * For every named filter except ARRAY and RAW an array input is filtered element by element and a
 * re-indexed list (original keys are not kept) is returned. The fallback used for unknown filter
 * names keeps the array keys and only touches string elements.
 *
 * @param   mixed   $source  Input string/array-of-string to be 'cleaned'
 * @param   string  $type    The return type for the variable:
 *                           INT:       An integer, or an array of integers,
 *                           UINT:      An unsigned integer, or an array of unsigned integers,
 *                           FLOAT:     A floating point number, or an array of floating point numbers
 *                                      (integer 0 when no number is found),
 *                           BOOLEAN:   A boolean value,
 *                           WORD:      A string containing A-Z or underscores only (not case sensitive),
 *                           ALNUM:     A string containing A-Z or 0-9 only (not case sensitive),
 *                           CMD:       A string containing A-Z, 0-9, underscores, periods or hyphens (not case sensitive),
 *                           BASE64:    A string containing A-Z, 0-9, forward slashes, plus or equals (not case sensitive),
 *                           STRING:    A fully decoded and sanitised string (default),
 *                           HTML:      A sanitised string,
 *                           ARRAY:     An array,
 *                           PATH:      A sanitised file path, or an array of sanitised file paths,
 *                           TRIM:      A string trimmed from normal, non-breaking and multibyte spaces
 *                           USERNAME:  Do not use (use an application specific filter),
 *                           RAW:       The raw string is returned with no filtering,
 *                           unknown:   An unknown filter will act like STRING. If the input is an array it will return an
 *                                      array of fully decoded and sanitised strings.
 *
 * @return  mixed  'Cleaned' version of input parameter
 *
 * @since   11.1
 */
public function clean($source, $type = 'string')
{
	// Strip Unicode Supplementary Characters when requested to do so
	if ($this->stripUSC)
	{
		$source = $this->stripUSC($source);
	}

	$type = strtoupper($type);

	switch ($type)
	{
		case 'ARRAY':
			return (array) $source;

		case 'RAW':
			return $source;

		case 'INT':
		case 'INTEGER':
		case 'UINT':
		case 'FLOAT':
		case 'DOUBLE':
		case 'BOOL':
		case 'BOOLEAN':
		case 'WORD':
		case 'ALNUM':
		case 'CMD':
		case 'BASE64':
		case 'STRING':
		case 'HTML':
		case 'PATH':
		case 'TRIM':
		case 'USERNAME':
			if (!is_array($source))
			{
				return $this->filterSingleValue($source, $type);
			}

			$result = array();

			// Iterate through the array; the result is a plain list, keys are not preserved
			foreach ($source as $eachValue)
			{
				$result[] = $this->filterSingleValue($eachValue, $type);
			}

			return $result;
	}

	// Unknown filter name. Are we dealing with an array?
	if (is_array($source))
	{
		foreach ($source as $key => $value)
		{
			// Filter element for XSS and other 'bad' code etc.
			if (is_string($value))
			{
				$source[$key] = $this->_remove($this->_decode($value));
			}
		}

		return $source;
	}

	// Or a string?
	if (is_string($source) && !empty($source))
	{
		// Filter source for XSS and other 'bad' code etc.
		return $this->_remove($this->_decode($source));
	}

	// Not an array or string... return the passed parameter
	return $source;
}

/**
 * Applies one of the named filters of clean() to a single (non-list) value.
 *
 * @param   mixed   $value  The value to filter
 * @param   string  $type   Upper-case filter name as dispatched by clean()
 *
 * @return  mixed  The filtered value
 *
 * @since   3.5
 */
private function filterSingleValue($value, $type)
{
	switch ($type)
	{
		case 'INT':
		case 'INTEGER':
			// Only the first run of digits (with optional sign) is used
			preg_match('/[-+]?[0-9]+/', (string) $value, $matches);

			return isset($matches[0]) ? (int) $matches[0] : 0;

		case 'UINT':
			preg_match('/[-+]?[0-9]+/', (string) $value, $matches);

			return isset($matches[0]) ? abs((int) $matches[0]) : 0;

		case 'FLOAT':
		case 'DOUBLE':
			preg_match('/[-+]?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?/', (string) $value, $matches);

			return isset($matches[0]) ? (float) $matches[0] : 0;

		case 'BOOL':
		case 'BOOLEAN':
			return (bool) $value;

		case 'WORD':
			return (string) preg_replace('/[^A-Z_]/i', '', $value);

		case 'ALNUM':
			return (string) preg_replace('/[^A-Z0-9]/i', '', $value);

		case 'CMD':
			// Leading periods are removed after the character filter
			return ltrim((string) preg_replace('/[^A-Z0-9_\.-]/i', '', $value), '.');

		case 'BASE64':
			return (string) preg_replace('/[^A-Z0-9\/+=]/i', '', $value);

		case 'STRING':
			return (string) $this->remove($this->decode((string) $value));

		case 'HTML':
			return (string) $this->remove((string) $value);

		case 'PATH':
			$pattern = '/^[A-Za-z0-9_\/-]+[A-Za-z0-9_\.-]*([\\\\\/][A-Za-z0-9_-]+[A-Za-z0-9_\.-]*)*$/';

			// The value is cast for single values too, exactly as it always was for array elements
			preg_match($pattern, (string) $value, $matches);

			return isset($matches[0]) ? (string) $matches[0] : '';

		case 'TRIM':
			$trimmed = (string) trim($value);

			// Byte sequence E3 80 80, then C2 A0, are trimmed in addition to the regular whitespace
			$trimmed = StringHelper::trim($trimmed, chr(0xE3) . chr(0x80) . chr(0x80));

			return StringHelper::trim($trimmed, chr(0xC2) . chr(0xA0));

		case 'USERNAME':
			return (string) preg_replace('/[\x00-\x1F\x7F<>"\'%&]/', '', $value);
	}

	// clean() only dispatches the filter names handled above
	return $value;
}
453
/**
 * Punycode-encodes the mailto targets found in a piece of content.
 *
 * Every `"mailto:...` occurrence matched by the pattern is stripped of its `"` and `?` characters
 * and replaced in the text by whatever JStringPunycode::emailToPunycode() returns for it.
 *
 * @param   string  $text  The strings to encode
 *
 * @return  string  The punyencoded mail
 *
 * @since   3.5
 */
public function emailToPunycode($text)
{
	$mailtoRegex = '/(("mailto:)+[\w\.\-\+]+\@[^"?]+\.+[^."?]+("|\?))/';

	// Nothing that looks like a mailto link: hand the text back untouched
	if (!preg_match_all($mailtoRegex, $text, $found))
	{
		return $text;
	}

	foreach ($found[0] as $address)
	{
		// Drop the delimiters that were only needed to anchor the match
		$address = (string) str_replace(array('?', '"'), '', $address);
		$encoded = JStringPunycode::emailToPunycode($address);
		$text    = (string) str_replace($address, $encoded, $text);
	}

	return $text;
}
478
/**
 * Checks an uploaded for suspicious naming and potential PHP contents which could indicate a hacking attempt.
 *
 * The options you can define are:
 * null_byte                   Prevent files with a null byte in their name (buffer overflow attack)
 * forbidden_extensions        Do not allow these strings anywhere in the file's extension
 * php_tag_in_content          Do not allow `<?php` tag in content
 * shorttag_in_content         Do not allow short tag `<?` in content
 * shorttag_extensions         Which file extensions to scan for short tags in content
 * fobidden_ext_in_content     Do not allow forbidden_extensions anywhere in content
 * php_ext_content_extensions  Which file extensions to scan for .php in content
 *
 * A temporary file that cannot be opened is not content-scanned and does not make the upload unsafe.
 * The file handle is closed on every exit path.
 *
 * This code is an adaptation and improvement of Admin Tools' UploadShield feature,
 * relicensed and contributed by its author.
 *
 * @param   array  $file     An uploaded file descriptor
 * @param   array  $options  The scanner options (see the code for details)
 *
 * @return  boolean  True of the file is safe
 *
 * @since   3.4
 */
public static function isSafeFile($file, $options = array())
{
	$defaultOptions = array(

		// Null byte in file name
		'null_byte'                  => true,

		// Forbidden string in extension (e.g. php matched .php, .xxx.php, .php.xxx and so on)
		'forbidden_extensions'       => array(
			'php', 'phps', 'pht', 'phtml', 'php3', 'php4', 'php5', 'php6', 'php7', 'inc', 'pl', 'cgi', 'fcgi', 'java', 'jar', 'py',
		),

		// <?php tag in file contents
		'php_tag_in_content'         => true,

		// <? tag in file contents
		'shorttag_in_content'        => true,

		// Which file extensions to scan for short tags
		'shorttag_extensions'        => array(
			'inc', 'phps', 'class', 'php3', 'php4', 'php5', 'txt', 'dat', 'tpl', 'tmpl',
		),

		// Forbidden extensions anywhere in the content
		'fobidden_ext_in_content'    => true,

		// Which file extensions to scan for .php in the content
		'php_ext_content_extensions' => array('zip', 'rar', 'tar', 'gz', 'tgz', 'bz2', 'tbz', 'jpa'),
	);

	$options = array_merge($defaultOptions, $options);

	// Make sure we can scan nested file descriptors
	$descriptors = $file;

	if (isset($file['name']) && isset($file['tmp_name']))
	{
		$descriptors = self::decodeFileData(
			array(
				$file['name'],
				$file['type'],
				$file['tmp_name'],
				$file['error'],
				$file['size'],
			)
		);
	}

	// Handle non-nested descriptors (single files)
	if (isset($descriptors['name']))
	{
		$descriptors = array($descriptors);
	}

	// Which checks are enabled does not depend on the individual file
	$hasForbiddenExtensions = !empty($options['forbidden_extensions']);
	$scanPhpTag             = (bool) $options['php_tag_in_content'];
	$scanShortTag           = (bool) $options['shorttag_in_content'];
	$scanForbiddenExt       = $options['fobidden_ext_in_content'] && $hasForbiddenExtensions;

	// Scan all descriptors detected
	foreach ($descriptors as $fileDescriptor)
	{
		if (!isset($fileDescriptor['name']))
		{
			// This is a nested descriptor. We have to recurse.
			if (!self::isSafeFile($fileDescriptor, $options))
			{
				return false;
			}

			continue;
		}

		$tempNames     = $fileDescriptor['tmp_name'];
		$intendedNames = $fileDescriptor['name'];

		if (!is_array($tempNames))
		{
			$tempNames = array($tempNames);
		}

		if (!is_array($intendedNames))
		{
			$intendedNames = array($intendedNames);
		}

		$len = count($tempNames);

		for ($i = 0; $i < $len; $i++)
		{
			$tempName     = array_shift($tempNames);
			$intendedName = array_shift($intendedNames);

			// 1. Null byte check
			if ($options['null_byte'] && strpos((string) $intendedName, "\x00") !== false)
			{
				return false;
			}

			/*
			 * All dot separated parts of the name except the first one, lowercased. This is needed by the
			 * content checks as well, so it must be computed even when forbidden_extensions is empty.
			 */
			$explodedName = explode('.', (string) $intendedName);
			$explodedName = array_reverse($explodedName);
			array_pop($explodedName);
			$explodedName = array_map('strtolower', $explodedName);

			// 2. PHP-in-extension check (.php, .php.xxx[.yyy[.zzz[...]]], .xxx[.yyy[.zzz[...]]].php)
			if ($hasForbiddenExtensions && self::nameHasAnyExtension($options['forbidden_extensions'], $explodedName))
			{
				return false;
			}

			// 3. File contents scanner (PHP tag in file contents)
			if (!$scanPhpTag && !$scanShortTag && !$scanForbiddenExt)
			{
				continue;
			}

			$fp = @fopen($tempName, 'r');

			if ($fp === false)
			{
				// Unreadable files are skipped, they are not reported as unsafe
				continue;
			}

			// Short tags are only looked for in files carrying one of the suspicious text extensions
			$checkShortTag = false;

			if ($scanShortTag)
			{
				$suspiciousExtensions = $options['shorttag_extensions'];

				if (empty($suspiciousExtensions))
				{
					$suspiciousExtensions = array(
						'inc', 'phps', 'class', 'php3', 'php4', 'txt', 'dat', 'tpl', 'tmpl',
					);
				}

				$checkShortTag = self::nameHasAnyExtension($suspiciousExtensions, $explodedName);
			}

			// Forbidden extensions are only looked for inside files carrying one of the archive extensions
			$checkForbiddenExt = false;

			if ($scanForbiddenExt)
			{
				$suspiciousExtensions = $options['php_ext_content_extensions'];

				if (empty($suspiciousExtensions))
				{
					$suspiciousExtensions = array(
						'zip', 'rar', 'tar', 'gz', 'tgz', 'bz2', 'tbz', 'jpa',
					);
				}

				$checkForbiddenExt = self::nameHasAnyExtension($suspiciousExtensions, $explodedName);
			}

			$data   = '';
			$isSafe = true;

			while ($isSafe && !feof($fp))
			{
				$chunk = @fread($fp, 131072);

				// A failing read would otherwise never reach the end of file and loop forever
				if ($chunk === false)
				{
					break;
				}

				$data .= $chunk;

				if ($scanPhpTag && stripos($data, '<?php') !== false)
				{
					$isSafe = false;
				}
				elseif ($checkShortTag && strpos($data, '<?') !== false)
				{
					// These are suspicious text files which may have the short tag (<?) in them
					$isSafe = false;
				}
				elseif ($checkForbiddenExt)
				{
					// These are suspicious files which may have an executable file extension in them
					foreach ($options['forbidden_extensions'] as $ext)
					{
						if (strpos($data, '.' . $ext) !== false)
						{
							$isSafe = false;

							break;
						}
					}
				}

				/*
				 * This makes sure that we don't accidentally skip a <?php tag if it's across
				 * a read boundary, even on multibyte strings
				 */
				$data = (string) substr($data, -10);
			}

			// Release the handle before reporting the verdict
			fclose($fp);

			if (!$isSafe)
			{
				return false;
			}
		}
	}

	return true;
}

/**
 * Tells whether at least one of the given extensions occurs among the parts of a file name.
 *
 * Deliberately a plain loop around in_array() instead of array_intersect(), as in the original
 * implementation of the scanner.
 *
 * @param   array  $extensions  The extensions to look for
 * @param   array  $nameParts   The lowercased, dot separated parts of the file name
 *
 * @return  boolean  True when one of the extensions is part of the name
 *
 * @since   3.4
 */
private static function nameHasAnyExtension($extensions, array $nameParts)
{
	foreach ($extensions as $ext)
	{
		if (in_array($ext, $nameParts))
		{
			return true;
		}
	}

	return false;
}
734
/**
 * Turns a positional file data array into one or more descriptors keyed by field name.
 *
 * The input holds, in this order, name, type, tmp_name, error and size. When the first entry is an
 * array the method recurses for every key of it and returns the decoded entries under the same keys.
 *
 * @param   array  $data  The data array to decode.
 *
 * @return  array
 *
 * @since   3.4
 */
protected static function decodeFileData(array $data)
{
	// A scalar name means this is a single file: just label the five fields
	if (!is_array($data[0]))
	{
		return array(
			'name'     => $data[0],
			'type'     => $data[1],
			'tmp_name' => $data[2],
			'error'    => $data[3],
			'size'     => $data[4],
		);
	}

	$decoded = array();

	foreach (array_keys($data[0]) as $index)
	{
		// Pick the same position out of each of the five parallel arrays
		$slice = array($data[0][$index], $data[1][$index], $data[2][$index], $data[3][$index], $data[4][$index]);

		$decoded[$index] = self::decodeFileData($slice);
	}

	return $decoded;
}
760
/**
 * Backwards compatible proxy which forwards to remove().
 *
 * @param   string  $source  Input string to be 'cleaned'
 *
 * @return  string  'Cleaned' version of input parameter
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::remove() instead
 */
protected function _remove($source)
{
	$cleaned = $this->remove($source);

	return $cleaned;
}
775
/**
 * Repeatedly strips unwanted tags and attributes until the string no longer changes.
 *
 * Input that is not valid UTF-8 is first passed through htmlspecialchars() with ENT_IGNORE and
 * decoded again, which discards the invalid byte sequences.
 *
 * @param   string  $source  Input string to be 'cleaned'
 *
 * @return  string  'Cleaned' version of input parameter
 *
 * @since   3.5
 */
protected function remove($source)
{
	// An empty pattern with the u modifier only matches when the subject is valid UTF-8
	$isValidUtf8 = (bool) preg_match('//u', $source);

	if ($isValidUtf8 === false)
	{
		$escaped = htmlspecialchars($source, ENT_IGNORE, 'UTF-8');
		$source  = htmlspecialchars_decode($escaped);
	}

	// Keep cleaning until a pass leaves the string untouched; this protects against nested tags
	while (true)
	{
		$previous = $source;
		$source   = $this->_cleanTags($source);

		if ($previous === $source)
		{
			break;
		}
	}

	return $source;
}
804
/**
 * Backwards compatible proxy which forwards to cleanTags().
 *
 * @param   string  $source  Input string to be 'cleaned'
 *
 * @return  string  'Cleaned' version of input parameter
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::cleanTags() instead
 */
protected function _cleanTags($source)
{
	$cleaned = $this->cleanTags($source);

	return $cleaned;
}
819
/**
 * Internal method to strip a string of certain tags
 *
 * The source is scanned once from left to right, driven by the byte offsets of the next '<' and
 * the next '>'. Text outside of tags is copied to the result. For every tag found:
 * - a tag without a name, with a name that is not purely alphanumeric starting with a letter, or
 *   (when xssAuto is on) with a name listed in $this->tagBlacklist is dropped;
 * - otherwise the tag is kept when it is in $this->tagsArray and tagsMethod is 0, or when it is
 *   not in $this->tagsArray and tagsMethod is non-zero;
 * - a kept closing tag is rebuilt as `</name>`; a kept opening tag is rebuilt from its name and the
 *   attributes returned by _cleanAttributes(), and is closed with `>` when a matching `</name>`
 *   exists later in the source or with ` />` when it does not.
 *
 * @param   string  $source  Input string to be 'cleaned'
 *
 * @return  string  'Cleaned' version of input parameter
 *
 * @since   3.5
 */
protected function cleanTags($source)
{
	// First, pre-process this for illegal characters inside attribute values
	$source = $this->_escapeAttributeValues($source);

	// In the beginning we don't really have a tag, so result is empty
	$result = '';
	$offset = 0;
	$length = strlen($source);

	// Is there a tag? If so it will certainly start with a '<'.
	$tagOpenStartOffset = strpos($source, '<');

	// Is there any close tag
	$tagOpenEndOffset = strpos($source, '>');

	// $offset is the position of the first byte that has not been copied or discarded yet
	while ($offset < $length)
	{
		// Preserve '>' character which exists before related '<'
		if ($tagOpenEndOffset !== false && ($tagOpenStartOffset === false || $tagOpenEndOffset < $tagOpenStartOffset))
		{
			$result .= substr($source, $offset, $tagOpenEndOffset - $offset) . '>';
			$offset = $tagOpenEndOffset + 1;

			// Search for a new closing indicator
			$tagOpenEndOffset = strpos($source, '>', $offset);

			continue;
		}

		// Add safe text appearing before the '<'
		if ($tagOpenStartOffset > $offset)
		{
			$result .= substr($source, $offset, $tagOpenStartOffset - $offset);
			$offset = $tagOpenStartOffset;
		}

		// There is no more tags
		if ($tagOpenStartOffset === false && $tagOpenEndOffset === false)
		{
			$result .= substr($source, $offset, $length - $offset);
			$offset = $length;

			break;
		}

		// Remove every '<' character if '>' does not exists or we have '<>'
		if ($tagOpenStartOffset !== false && $tagOpenEndOffset === false || $tagOpenStartOffset + 1 == $tagOpenEndOffset)
		{
			// Skipping the byte at $offset without copying it is what removes the '<'
			$offset++;

			// Search for a new opening indicator
			$tagOpenStartOffset = strpos($source, '<', $offset);

			continue;
		}

		// Check for mal-formed tag where we have a second '<' before the '>'
		$nextOpenStartOffset = strpos($source, '<', $tagOpenStartOffset + 1);

		if ($nextOpenStartOffset !== false && $nextOpenStartOffset < $tagOpenEndOffset)
		{
			// At this point we have a mal-formed tag, skip previous '<'
			$offset++;

			// Set a new opening indicator position
			$tagOpenStartOffset = $nextOpenStartOffset;

			continue;
		}

		// Let's get some information about our tag and setup attribute pairs
		// Now we have something like 'span class="" style=""', '/span', 'br/', 'br /' or 'hr disabled /'
		$tagContent = substr($source, $offset + 1, $tagOpenEndOffset - 1 - $offset);

		// All ASCII whitespaces replace by 0x20
		$tagNormalized = preg_replace('/\s/', ' ', $tagContent);
		$tagLength = strlen($tagContent);
		$spaceOffset = strpos($tagNormalized, ' ');

		// Are we an open tag or a close tag?
		$isClosingTag = $tagContent[0] === '/' ? 1 : 0;
		$isSelfClosingTag = substr($tagContent, -1) === '/' ? 1 : 0;

		// The 0/1 flags double as the number of bytes to cut off at the start/end of the tag name
		if ($spaceOffset !== false)
		{
			$tagName = substr($tagContent, $isClosingTag, $spaceOffset - $isClosingTag);
		}
		else
		{
			$tagName = substr($tagContent, $isClosingTag, $tagLength - $isClosingTag - $isSelfClosingTag);
		}

		/*
		 * Exclude all "non-regular" tagnames
		 * OR no tagname
		 * OR remove if xssauto is on and tag is blacklisted
		 */
		if (!$tagName
			|| !preg_match("/^[a-z][a-z0-9]*$/i", $tagName)
			|| ($this->xssAuto && in_array(strtolower($tagName), $this->tagBlacklist)))
		{
			// Jump over the whole tag: its content plus the surrounding '<' and '>'
			$offset += $tagLength + 2;

			$tagOpenStartOffset = strpos($source, '<', $offset);
			$tagOpenEndOffset = strpos($source, '>', $offset);

			// Strip tag
			continue;
		}

		$attrSet = array();

		/*
		 * Time to grab any attributes from the tag... need this section in
		 * case attributes have spaces in the values.
		 */
		while ($spaceOffset !== false && $spaceOffset + 1 < $tagLength)
		{
			$attrStartOffset = $spaceOffset + 1;

			// Find position of equal and open quote
			if (preg_match('#= *(")[^"]*(")#', $tagNormalized, $matches, PREG_OFFSET_CAPTURE, $attrStartOffset))
			{
				$equalOffset = $matches[0][1];
				$quote1Offset = $matches[1][1];
				$quote2Offset = $matches[2][1];
				$nextSpaceOffset = strpos($tagNormalized, ' ', $quote2Offset);
			}
			else
			{
				$equalOffset = strpos($tagNormalized, '=', $attrStartOffset);
				$quote1Offset = strpos($tagNormalized, '"', $attrStartOffset);
				$nextSpaceOffset = strpos($tagNormalized, ' ', $attrStartOffset);

				if ($quote1Offset !== false)
				{
					$quote2Offset = strpos($tagNormalized, '"', $quote1Offset + 1);
				}
				else
				{
					$quote2Offset = false;
				}
			}

			// Do we have an attribute to process? [check for equal sign]
			if ($tagContent[$attrStartOffset] !== '/'
				&& ($equalOffset && $nextSpaceOffset && $nextSpaceOffset < $equalOffset || !$equalOffset))
			{
				// Search for attribute without value, ex: 'checked/' or 'checked '
				if ($nextSpaceOffset)
				{
					$attrEndOffset = $nextSpaceOffset;
				}
				else
				{
					$attrEndOffset = strpos($tagContent, '/', $attrStartOffset);

					if ($attrEndOffset === false)
					{
						$attrEndOffset = $tagLength;
					}
				}

				// If there is an ending, use this, if not, do not worry.
				if ($attrEndOffset > $attrStartOffset)
				{
					$attrSet[] = substr($tagContent, $attrStartOffset, $attrEndOffset - $attrStartOffset);
				}
			}
			elseif ($equalOffset !== false)
			{
				/*
				 * If the attribute value is wrapped in quotes we need to grab the substring from
				 * the closing quote, otherwise grab until the next space.
				 */
				if ($quote1Offset !== false && $quote2Offset !== false)
				{
					// Add attribute, ex: 'class="body abc"'
					$attrSet[] = substr($tagContent, $attrStartOffset, $quote2Offset + 1 - $attrStartOffset);
				}
				else
				{
					if ($nextSpaceOffset)
					{
						$attrEndOffset = $nextSpaceOffset;
					}
					else
					{
						$attrEndOffset = $tagLength;
					}

					// Add attribute, ex: 'class=body'
					$attrSet[] = substr($tagContent, $attrStartOffset, $attrEndOffset - $attrStartOffset);
				}
			}

			// Continue after the space that ended this attribute; false ends the attribute loop
			$spaceOffset = $nextSpaceOffset;
		}

		// Is our tag in the user input array?
		$tagFound = in_array(strtolower($tagName), $this->tagsArray);

		// If the tag is allowed let's append it to the output string.
		if ((!$tagFound && $this->tagsMethod) || ($tagFound && !$this->tagsMethod))
		{
			// Reconstruct tag with allowed attributes
			if ($isClosingTag)
			{
				$result .= "</$tagName>";
			}
			else
			{
				$attrSet = $this->_cleanAttributes($attrSet);

				// Open or single tag
				$result .= '<' . $tagName;

				if ($attrSet)
				{
					$result .= ' ' . implode(' ', $attrSet);
				}

				// Reformat single tags to XHTML
				if (strpos($source, "</$tagName>", $tagOpenStartOffset) !== false)
				{
					$result .= '>';
				}
				else
				{
					$result .= ' />';
				}
			}
		}

		// Move past the tag just handled, whether it was kept or dropped
		$offset += $tagLength + 2;

		if ($offset < $length)
		{
			// Find next tag's start and continue iteration
			$tagOpenStartOffset = strpos($source, '<', $offset);
			$tagOpenEndOffset = strpos($source, '>', $offset);
		}
	}

	return $result;
}
1076
/**
 * Backwards compatible proxy which forwards to cleanAttributes().
 *
 * @param   array  $attrSet  Array of attribute pairs to filter
 *
 * @return  array  Filtered array of attribute pairs
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::cleanAttributes() instead
 */
protected function _cleanAttributes($attrSet)
{
	$filtered = $this->cleanAttributes($attrSet);

	return $filtered;
}
1091
/**
 * Escape < > and " inside attribute values
 *
 * The source is consumed attribute by attribute: everything up to and including the opening quote
 * of a quoted attribute value is copied as is, the value itself has its `<`, `"` and `>` characters
 * replaced by the corresponding HTML entities and is passed through _stripCSSExpressions(), and a
 * closing quote is appended. A value whose closing quote cannot be found runs to the end of the
 * string and is closed there.
 *
 * @param   string  $source  The source string.
 *
 * @return  string  Filtered string
 *
 * @since   3.5
 */
protected function escapeAttributeValues($source)
{
	$alreadyFiltered = '';
	$remainder       = $source;
	$badChars        = array('<', '"', '>');

	// Same order as $badChars; the replacements must differ from the originals or nothing is escaped
	$escapedChars = array('&lt;', '&quot;', '&gt;');

	/*
	 * Process each portion based on presence of =" and "<space>, "/>, or ">
	 * See if there are any more attributes to process
	 */
	while (preg_match('#<[^>]*?=\s*?(\"|\')#s', $remainder, $matches, PREG_OFFSET_CAPTURE))
	{
		// Get the portion before the attribute value
		$quotePosition = $matches[0][1];
		$nextBefore    = $quotePosition + strlen($matches[0][0]);

		/*
		 * Figure out if we have a single or double quote and look for the matching closing quote
		 * Closing quote should be "/>, ">, "<space>, or " at the end of the string
		 */
		$quote     = substr($matches[0][0], -1);
		$pregMatch = ($quote == '"') ? '#(\"\s*/\s*>|\"\s*>|\"\s+|\"$)#' : "#(\'\s*/\s*>|\'\s*>|\'\s+|\'$)#";

		// Get the portion after attribute value
		if (preg_match($pregMatch, substr($remainder, $nextBefore), $matches, PREG_OFFSET_CAPTURE))
		{
			// We have a closing quote
			$nextAfter = $nextBefore + $matches[0][1];
		}
		else
		{
			// No closing quote
			$nextAfter = strlen($remainder);
		}

		// Get the actual attribute value
		$attributeValue = substr($remainder, $nextBefore, $nextAfter - $nextBefore);

		// Escape bad chars
		$attributeValue = str_replace($badChars, $escapedChars, $attributeValue);
		$attributeValue = $this->_stripCSSExpressions($attributeValue);
		$alreadyFiltered .= substr($remainder, 0, $nextBefore) . $attributeValue . $quote;

		// Skip the closing quote; substr() may yield false past the end of the string on older PHP versions
		$remainder = (string) substr($remainder, $nextAfter + 1);
	}

	// At this point, we just have to return the $alreadyFiltered and the $remainder
	return $alreadyFiltered . $remainder;
}
1150
/**
 * Backwards compatible proxy which forwards to decode().
 *
 * @param   string  $source  The source string.
 *
 * @return  string  Plaintext string
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::decode() instead
 */
protected function _decode($source)
{
	$plain = $this->decode($source);

	return $plain;
}
1165
/**
 * Try to convert to plaintext
 *
 * Three passes are applied in this order: named entities of the ISO-8859-1 HTML entity table,
 * decimal numeric entities (`&#NNN;`) and hexadecimal numeric entities (`&#xHHH;`). The result
 * is UTF-8.
 *
 * Numeric entities are converted by code point, so values above 255 yield the character they
 * denote. Entities naming a surrogate or a value above U+10FFFF are left untouched. When USC
 * stripping is enabled, entities for code points above U+FFFF decode to the same replacement
 * character stripUSC() uses, so decoding cannot re-introduce 4-byte characters.
 *
 * @param   string  $source  The source string.
 *
 * @return  string  Plaintext string
 *
 * @since   3.5
 */
protected function decode($source)
{
	static $ttr;

	$stripSupplementary = (bool) $this->stripUSC;

	// Encodes one Unicode code point as UTF-8, or returns null when it is not a valid scalar value
	$toUtf8 = function ($codePoint) use ($stripSupplementary)
	{
		if ($codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF))
		{
			return null;
		}

		// hexdec() may hand in a float, the range check above makes the cast safe
		$codePoint = (int) $codePoint;

		if ($codePoint < 0x80)
		{
			return chr($codePoint);
		}

		if ($codePoint < 0x800)
		{
			return chr(0xC0 | ($codePoint >> 6)) . chr(0x80 | ($codePoint & 0x3F));
		}

		if ($codePoint < 0x10000)
		{
			return chr(0xE0 | ($codePoint >> 12)) . chr(0x80 | (($codePoint >> 6) & 0x3F)) . chr(0x80 | ($codePoint & 0x3F));
		}

		if ($stripSupplementary)
		{
			// Same replacement bytes as used by stripUSC()
			return "\xE2\xAF\x91";
		}

		return chr(0xF0 | ($codePoint >> 18)) . chr(0x80 | (($codePoint >> 12) & 0x3F))
			. chr(0x80 | (($codePoint >> 6) & 0x3F)) . chr(0x80 | ($codePoint & 0x3F));
	};

	if (!is_array($ttr))
	{
		// Entity decode
		$trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');

		foreach ($trans_tbl as $k => $v)
		{
			// In ISO-8859-1 every byte is the code point itself; convert byte by byte to UTF-8
			$utf8 = '';

			for ($i = 0, $n = strlen($k); $i < $n; $i++)
			{
				$utf8 .= $toUtf8(ord($k[$i]));
			}

			$ttr[$v] = $utf8;
		}
	}

	$source = strtr($source, $ttr);

	// Convert decimal
	$source = preg_replace_callback('/&#(\d+);/m', function($m) use ($toUtf8)
		{
			$char = $toUtf8((int) $m[1]);

			return $char === null ? $m[0] : $char;
		}, $source
	);

	// Convert hex
	$source = preg_replace_callback('/&#x([a-f0-9]+);/mi', function($m) use ($toUtf8)
		{
			$char = $toUtf8(hexdec($m[1]));

			return $char === null ? $m[0] : $char;
		}, $source
	);

	return $source;
}
1208
/**
 * Backwards compatible proxy which forwards to escapeAttributeValues().
 *
 * @param   string  $source  The source string.
 *
 * @return  string  Filtered string
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::escapeAttributeValues() instead
 */
protected function _escapeAttributeValues($source)
{
	$escaped = $this->escapeAttributeValues($source);

	return $escaped;
}
1223
/**
 * Backwards compatible proxy which forwards to stripCSSExpressions().
 *
 * @param   string  $source  The source string.
 *
 * @return  string  Filtered string
 *
 * @since       11.1
 * @deprecated  4.0 Use JFilterInput::stripCSSExpressions() instead
 */
protected function _stripCSSExpressions($source)
{
	$stripped = $this->stripCSSExpressions($source);

	return $stripped;
}
1238
/**
 * Recursively strip Unicode Supplementary Characters from the source. Note: objects cannot be filtered.
 *
 * Arrays are filtered element by element with their keys preserved. Strings have every 4-byte
 * sequence (a lead byte in the range F0-F7 followed by any three bytes) replaced by the bytes
 * E2 AF 91 (U+2BD1). Any other value (objects, integers, floats, booleans, null) is returned
 * unchanged so that its type is not altered by the filter.
 *
 * @param   mixed  $source  The data to filter
 *
 * @return  mixed  The filtered result
 *
 * @since   3.5
 */
protected function stripUSC($source)
{
	if (is_array($source))
	{
		$filteredArray = array();

		foreach ($source as $k => $v)
		{
			$filteredArray[$k] = $this->stripUSC($v);
		}

		return $filteredArray;
	}

	// Only strings can contain 4-byte sequences; passing anything else on would cast it to a string
	if (!is_string($source))
	{
		return $source;
	}

	// Byte-wise match (no u modifier); the s modifier lets the dots match newline bytes as well
	return preg_replace('/[\xF0-\xF7].../s', "\xE2\xAF\x91", $source);
}
1269 }
1270