1 <?php
2 /**
3 * htmlfilter.inc
4 * ---------------
5 * This set of functions allows you to filter html in order to remove
6 * any malicious tags from it. Useful in cases when you need to filter
7 * user input for any cross-site-scripting attempts.
8 *
9 * Copyright (C) 2002-2004 by Duke University
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 * 02110-1301 USA
25 *
26 * @Author Konstantin Riabitsev <icon@linux.duke.edu>
27 * @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
28 * @Version 1.1 ($Date$)
29 */
30
/**
 * Assembles the textual form of a tag from its name, its attributes and
 * its type. Called internally by tln_sanitize.
 *
 * @param string  $tagname the name of the tag.
 * @param array   $attary  map of attribute name => attribute value. Values
 *                         are emitted verbatim; no quoting is added here.
 *                         Anything that is not a non-empty array is ignored.
 * @param integer $tagtype 1 = opening tag, 2 = closing tag,
 *                         3 = XHTML-style content-less tag.
 * @return string the printable tag.
 */
function tln_tagprint($tagname, $attary, $tagtype)
{
    // Closing tags never carry attributes.
    if ($tagtype == 2) {
        return '</' . $tagname . '>';
    }

    $pieces = array('<' . $tagname);
    if (is_array($attary) && count($attary)) {
        foreach ($attary as $name => $value) {
            $pieces[] = "$name=$value";
        }
    }

    // Content-less tags get the " /" marker before the closing bracket.
    $closer = ($tagtype == 3) ? ' />' : '>';

    return implode(' ', $pieces) . $closer;
}
61
/**
 * Lowercases a value in place. Intended as an array_walk callback, which
 * is how tln_sanitize uses it on its tag lists.
 *
 * @param string $val the value to lowercase, passed by reference.
 * @return void the result is written back through the reference.
 */
function tln_casenormalize(&$val)
{
    $lowered = strtolower($val);
    $val = $lowered;
}
73
/**
 * Skips over any whitespace starting at the given position and returns
 * the position of the next non-whitespace character.
 *
 * @param string  $body   the string to scan.
 * @param integer $offset the position within $body where scanning starts.
 * @return integer the position of the next non-whitespace character;
 *                 $offset itself when there is no whitespace at $offset
 *                 (including when $offset is at or past the end).
 */
function tln_skipspace($body, $offset)
{
    $matches = array();
    // The previous implementation called sizeof() on the matched *string*,
    // which warns on PHP 7.2+ and throws a TypeError on PHP 8. Measuring
    // the match with strlen() gives the same result without that hazard.
    // The cast covers PHP < 8, where substr() returns false past the end.
    if (preg_match('/^\s+/', (string) substr($body, $offset), $matches)) {
        $offset += strlen($matches[0]);
    }
    return $offset;
}
93
/**
 * Locates the next occurrence of a needle in a string. A thin wrapper
 * around strpos() that reports "not found" as the length of the string
 * instead of false.
 *
 * @param string  $body   the string to search in.
 * @param integer $offset the position to start searching from.
 * @param string  $needle the character/string to search for.
 * @return integer position of the next occurrence of $needle, or
 *                 strlen($body) when it does not occur.
 */
function tln_findnxstr($body, $offset, $needle)
{
    $found = strpos($body, $needle, $offset);

    return ($found === false) ? strlen($body) : $found;
}
113
/**
 * Finds the first place at or after $offset where a PCRE fragment
 * matches.
 *
 * The fragment is embedded in a pattern delimited by "%", so it must not
 * contain an unescaped "%".
 *
 * @param string  $body   the string to search in.
 * @param integer $offset the position to start searching from.
 * @param string  $reg    a PCRE fragment (without delimiters) to match.
 * @return array|boolean false when nothing (or only an empty string)
 *                 matched, otherwise an array with the following members:
 *                 - integer position of the match within $body
 *                 - string content between $offset and the match
 *                 - string the text that matched $reg
 */
function tln_findnxreg($body, $offset, $reg)
{
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    $found = preg_match($preg_rule, (string) substr($body, $offset), $matches);

    // Compare against the empty string explicitly: a plain truthiness
    // test would also reject a successful match whose text is "0".
    if (!$found || $matches[0] === '') {
        return false;
    }

    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2],
    );
}
142
/**
 * Locates and parses the next tag in the body.
 *
 * Attribute values in the returned array always carry their quotes:
 * quoted values are kept as found, unquoted values are wrapped in double
 * quotes (with embedded double quotes HTML-escaped), and value-less
 * attributes become attrname="yes".
 *
 * @param string  $body   string in which to look for the next tag.
 * @param integer $offset position to start looking from.
 * @return array|boolean false if no more tags exist in the body, or
 *                 an array with the following members:
 *                 - string with the (lowercased) name of the tag
 *                 - array with attributes and their values, or false
 *                   when the tag ended right after its name
 *                 - integer with tag type (1, 2, or 3)
 *                 - integer where the tag starts (starting "<")
 *                 - integer where the tag ends (ending ">")
 *                 The first three members are false if the tag is
 *                 invalid (this includes comments and declarations).
 */
function tln_getnxtag($body, $offset)
{
    if ($offset > strlen($body)) {
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt == strlen($body)) {
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= strlen($body)) {
        return array(false, false, false, $lt, strlen($body));
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    switch (substr($body, $pos, 1)) {
        case '/':
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /**
             * A comment or an SGML declaration. Either way it is
             * reported as an invalid tag so the caller strips it.
             */
            if (substr($body, $pos + 1, 2) == '--') {
                $gt = strpos($body, '-->', $pos);
                if ($gt === false) {
                    $gt = strlen($body);
                } else {
                    // Point at the ">" of the "-->" terminator.
                    $gt += 2;
                }
                return array(false, false, false, $lt, $gt);
            }
            $gt = tln_findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        default:
            /**
             * Assume tagtype 1 for now. If it's type 3, we'll switch
             * values later.
             */
            $tagtype = 1;
            break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false) {
        return array(false, false, false, $lt, strlen($body));
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid.
             */
            if (substr($body, $pos, 2) == '/>') {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            // intentional fall-through
        case '>':
            return array($tagname, false, $tagtype, $lt, $pos);
        default:
            /**
             * Check if it's whitespace.
             */
            if (!preg_match('/\s/', $match)) {
                /**
                 * This is an invalid tag! Look for the next closing ">".
                 */
                $gt = tln_findnxstr($body, $lt, '>');
                return array(false, false, false, $lt, $gt);
            }
            break;
    }

    /**
     * At this point we're here:
     * <tagname attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= strlen($body)) {
        $pos = tln_skipspace($body, $pos);
        if ($pos == strlen($body)) {
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
            /**
             * Yep. So we did.
             */
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>') {
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * a double quote and convert it to its HTML entity if needed,
         * then wrap in double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false) {
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, strlen($body));
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match) {
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah"/>. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid.
                 */
                if (substr($body, $pos, 2) == '/>') {
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                // intentional fall-through
            case '>':
                $attary[$attname] = '"yes"';
                return array($tagname, $attary, $tagtype, $lt, $pos);
            default:
                /**
                 * Skip whitespace and see what we arrive at.
                 */
                $pos = tln_skipspace($body, $pos);
                $char = substr($body, $pos, 1);
                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1 2 or 3.
                 * \w means this was attribute type 4.
                 * anything else we ignore and re-loop. End of tag and
                 * invalid stuff will be caught by our checks at the
                 * beginning of the loop.
                 */
                if ($char == '=') {
                    $pos++;
                    $pos = tln_skipspace($body, $pos);
                    /**
                     * Here are 3 possibilities:
                     * "'" attribute type 1
                     * '"' attribute type 2
                     * everything else is the content of tag type 3
                     */
                    $quot = substr($body, $pos, 1);
                    if ($quot == '\'') {
                        $regary = tln_findnxreg($body, $pos + 1, '\'');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        $pos++;
                        $attary[$attname] = '\'' . $attval . '\'';
                    } elseif ($quot == '"') {
                        $regary = tln_findnxreg($body, $pos + 1, '\"');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        $pos++;
                        $attary[$attname] = '"' . $attval . '"';
                    } else {
                        /**
                         * These are hateful. Look for \s, or >.
                         */
                        $regary = tln_findnxreg($body, $pos, '[\s>]');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        /**
                         * If it's ">" it will be caught at the top.
                         *
                         * The value is about to be wrapped in double
                         * quotes, so any double quote inside it has to be
                         * escaped; otherwise input such as
                         * alt=a"onerror="x would break out of the value
                         * and smuggle in a new attribute. The entity is
                         * assembled from two pieces so that tools which
                         * decode HTML entities in this source cannot turn
                         * the replacement back into a bare quote.
                         */
                        $attval = str_replace('"', '&' . 'quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } elseif (preg_match('|[\w/>]|', $char)) {
                    /**
                     * That was attribute type 4.
                     */
                    $attary[$attname] = '"yes"';
                } else {
                    /**
                     * An illegal character. Find next '>' and return.
                     */
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                break;
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, strlen($body));
}
430
/**
 * Translates numeric entities/escapes into literal single-byte characters
 * so the value can be checked.
 *
 * Every match of $regex is replaced by the character whose code is given
 * by the first capture group. Codes above 255 wrap modulo 256.
 *
 * @param string  $attvalue the by-ref value to translate.
 * @param string  $regex    the regular expression to look for; its first
 *                          capture group must hold the numeric code.
 * @param boolean $hex      whether the captured code is hexadecimal.
 * @return boolean true if anything matched (and was replaced), false
 *                 otherwise.
 */
function tln_deent(&$attvalue, $regex, $hex = false)
{
    $matches = array();
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || !isset($matches[0]) || count($matches[0]) === 0) {
        return false;
    }

    $repl = array();
    foreach ($matches[0] as $i => $entity) {
        $numval = $matches[1][$i];
        if ($hex) {
            $numval = hexdec($numval);
        }
        // Reduce the code modulo 256 ourselves (the wrap-around chr() has
        // traditionally applied). Going through fmod() keeps absurdly
        // long digit runs from reaching chr() as an out-of-range value.
        $repl[$entity] = chr((int) fmod((float) $numval, 256));
    }
    $attvalue = strtr($attvalue, $repl);

    return true;
}
457
/**
 * Decodes entity-encoded and backslash-escaped characters in an attribute
 * value into plain 8-bit characters so that checks can be run on the
 * result.
 *
 * Three encodings are decoded repeatedly until none is left: decimal
 * numeric entities, hexadecimal numeric entities, and backslash followed
 * by digits. Finally stripslashes() is applied.
 *
 * @param string $attvalue the by-ref value to decode.
 * @return void the result is written back through the reference.
 */
function tln_defang(&$attvalue)
{
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false
    ) {
        return;
    }
    do {
        $m = false;
        // The entity prefixes are spelled "&\#" (PCRE reads "\#" as a
        // literal "#") so the patterns never contain a character sequence
        // that looks like an HTML entity: an entity-decoding pass over
        // this source previously corrupted both patterns, which silently
        // disabled numeric-entity decoding.
        $m = $m || tln_deent($attvalue, '/&\#0*(\d+);*/s');
        $m = $m || tln_deent($attvalue, '/&\#x0*((\d|[a-f])+);*/si', true);
        $m = $m || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
}
483
/**
 * Strips tabs, carriage returns, newlines, NUL bytes and spaces from a
 * value. Some browsers have treated "java[tab]script" as equivalent to
 * "javascript", so these characters are removed before values are
 * checked.
 *
 * @param string $attvalue the by-ref attribute value to clean.
 * @return void the result is written back through the reference.
 */
function tln_unspace(&$attvalue)
{
    $junk = array("\t", "\r", "\n", "\0", " ");

    // Nothing to strip when the junk-free prefix spans the whole value.
    if (strcspn($attvalue, "\t\r\n\0 ") == strlen($attvalue)) {
        return;
    }

    $attvalue = str_replace($junk, array('', '', '', '', ''), $attvalue);
}
501
/**
 * Runs the attribute-level checks on one tag: removes forbidden
 * attributes, rewrites dangerous attribute values, sanitizes url(...)
 * references inside style attributes and appends forced attributes.
 *
 * @param string  $tagname               name of the tag being processed.
 * @param array   $attary                map of attribute name => quoted
 *                                       attribute value.
 * @param array   $rm_attnames           map of tag-name regex => list of
 *                                       attribute-name regexes to remove.
 * @param array   $bad_attvals           map of tag-name regex => map of
 *                                       attribute-name regex =>
 *                                       array(patterns, replacements)
 *                                       applied with preg_replace.
 * @param array   $add_attr_to_tag       map of tag-name regex => array of
 *                                       attributes merged into the tag.
 * @param string  $trans_image_path      replacement URL for rejected
 *                                       images.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array the modified attribute map.
 */
function tln_fixatts(
    $tagname,
    $attary,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    foreach ($attary as $attname => $attvalue) {
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr) {
                    if (preg_match($matchattr, $attname)) {
                        unset($attary[$attname]);
                        // Move on to the next attribute. A bare
                        // "continue" would only leave this innermost
                        // loop, and the value checks below could then
                        // put the removed attribute back.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         * A style value that changes under decoding was obfuscated, so
         * it is replaced wholesale.
         */
        $oldattvalue = $attvalue;
        tln_defang($attvalue);
        if ($attname == 'style' && $attvalue !== $oldattvalue) {
            $attvalue = "idiocy";
            $attary[$attname] = $attvalue;
        }
        tln_unspace($attvalue);

        /**
         * Now run the configured checks on the (decoded) value. The
         * stored attribute is only touched when a rule changes it.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr => $valary) {
                    if (preg_match($matchattr, $attname)) {
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue != $attvalue) {
                            $attary[$attname] = $newvalue;
                            $attvalue = $newvalue;
                        }
                    }
                }
            }
        }
        if ($attname == 'style') {
            if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                // 8-bit and control characters are rejected outright. The
                // url() rewrite is skipped in this case so that it cannot
                // overwrite the rejection with the original value.
                $attary[$attname] = '"disallowed character"';
            } else {
                $aMatch = array();
                preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
                if (!empty($aMatch[1])) {
                    // Quote character enclosing the whole attribute value.
                    $outerQuote = substr($attvalue, 0, 1);
                    foreach ($aMatch[1] as $sMatch) {
                        $urlvalue = $sMatch;
                        tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                        // tln_fixurl wraps unquoted URLs in double quotes.
                        // If that is also the attribute's own quote, the
                        // attribute would be terminated early, so switch
                        // the URL to the other quote character.
                        $urlQuote = substr($urlvalue, 0, 1);
                        if (($urlQuote === '"' || $urlQuote === "'") && $urlQuote === $outerQuote) {
                            $otherQuote = ($urlQuote === '"') ? "'" : '"';
                            $urlvalue = $otherQuote . substr($urlvalue, 1, -1) . $otherQuote;
                        }
                        $attvalue = str_replace($sMatch, $urlvalue, $attvalue);
                    }
                    $attary[$attname] = $attvalue;
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary) {
        if (preg_match($matchtag, $tagname)) {
            $attary = array_merge($attary, $addattary);
        }
    }
    return $attary;
}
597
/**
 * Validates a URL taken from an attribute or a CSS url(...) reference and
 * replaces it when it is not acceptable.
 *
 * Surrounding quotes are stripped before checking; replacement values are
 * wrapped in the quote character that was found (double quote when the
 * value was unquoted). Outcomes:
 * - empty value, control/8-bit characters, unknown scheme, or no scheme
 *   with a path other than $trans_image_path: replaced by
 *   $trans_image_path (for href with control/8-bit characters, by a fixed
 *   invalid-URL marker instead);
 * - mailto/http/https/ftp on href, and outbind/cid anywhere: kept and
 *   re-quoted;
 * - mailto/http/https/ftp elsewhere: replaced when external images are
 *   blocked or the URL has no path, otherwise left as is (unquoted).
 *
 * @param string  $attname               name of the attribute the URL
 *                                       belongs to.
 * @param string  $attvalue              the by-ref URL value.
 * @param string  $trans_image_path      replacement URL.
 * @param boolean $block_external_images whether remote non-href URLs are
 *                                       replaced.
 * @return void the result is written back through the reference.
 */
function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
{
    $sQuote = '"';
    $attvalue = trim($attvalue);
    if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
        // Remember the quote in use and strip the surrounding pair.
        $sQuote = $attvalue[0];
        $attvalue = trim(substr($attvalue, 1, -1));
    }
    $blank = $sQuote . $trans_image_path . $sQuote;

    /**
     * Replace empty src tags with the blank image. src is only used
     * for frames, images, and image inputs. Doing a replace should
     * not affect them working as should be, however it will stop
     * IE from being kicked off when src for img tags are not set.
     */
    if ($attvalue == '') {
        $attvalue = $blank;
        return;
    }

    // Disallow 8 bit characters and control characters.
    if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
        if ($attname == 'href') {
            $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
        } else {
            $attvalue = $blank;
        }
        return;
    }

    $aUrl = parse_url($attvalue);
    if (isset($aUrl['scheme'])) {
        switch (strtolower($aUrl['scheme'])) {
            case 'mailto':
            case 'http':
            case 'https':
            case 'ftp':
                if ($attname != 'href') {
                    if ($block_external_images == true) {
                        $attvalue = $blank;
                    } elseif (!isset($aUrl['path'])) {
                        $attvalue = $blank;
                    }
                } else {
                    $attvalue = $sQuote . $attvalue . $sQuote;
                }
                break;
            case 'outbind':
            case 'cid':
                $attvalue = $sQuote . $attvalue . $sQuote;
                break;
            default:
                $attvalue = $blank;
                break;
        }
        return;
    }

    // No scheme at all: only the blank image itself is acceptable.
    // (This used to assign to "$$attvalue", a variable-variable, so the
    // replacement never reached the caller.)
    if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
        $attvalue = $blank;
    }
}
665
/**
 * Extracts and sanitizes the content of a <style> block.
 *
 * Scans $body from $pos up to the matching closing style tag, skipping
 * HTML comments on the way (so that a closing style tag inside a comment
 * is not mistaken for the end of the block), then neutralizes dangerous
 * CSS constructs in the collected content.
 *
 * @param string  $body                  the complete HTML body.
 * @param integer $pos                   position right after the opening
 *                                       style tag.
 * @param string  $trans_image_path      replacement URL for rejected
 *                                       url(...) references.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array two members: the sanitized style content (false when no
 *               closing style tag was found) and the position in $body at
 *               which parsing should continue.
 */
function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
{
    // workaround for </style> in between comments
    $content = '';
    $sToken = '';
    $bSucces = false;
    $bEndTag = false;
    for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
        $char = $body[$i];
        switch ($char) {
            case '<':
                $sToken = $char;
                break;
            case '/':
                if ($sToken == '<') {
                    $sToken .= $char;
                    $bEndTag = true;
                } else {
                    $content .= $char;
                }
                break;
            case '>':
                if ($bEndTag) {
                    $sToken .= $char;
                    if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                        $newpos = $i + 1;
                        $bSucces = true;
                        break 2;
                    } else {
                        // Some other closing tag: keep it as content.
                        $content .= $sToken;
                    }
                    $bEndTag = false;
                } else {
                    $content .= $char;
                }
                break;
            case '!':
                if ($sToken == '<') {
                    // possible comment
                    if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                        $end = strpos($body, '-->', $i + 3);
                        if ($end === false) { // no end comment
                            $i = strlen($body);
                        } else {
                            // Land on the ">" of "-->" so the loop
                            // increment steps past the whole terminator;
                            // stopping on its first "-" used to leak
                            // "->" into the style content.
                            $i = $end + 2;
                        }
                        $sToken = '';
                    }
                } else {
                    $content .= $char;
                }
                break;
            default:
                if ($bEndTag) {
                    $sToken .= $char;
                } else {
                    $content .= $char;
                }
                break;
        }
    }
    if ($bSucces == false) {
        return array(false, strlen($body));
    }

    /**
     * First look for general BODY style declaration, which would be
     * like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so we can just assign it to a <div>
     */
    $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);

    // first check for 8bit sequences and disallowed control characters
    if (preg_match('/[\16-\37\200-\377]+/', $content)) {
        $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
        return array($content, $newpos);
    }

    // remove @import line
    $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);

    // Collapse backslash-obfuscated spellings of "url", then vet every
    // url(...) reference.
    $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
    $aMatch = array();
    preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
    if (!empty($aMatch[1])) {
        $aValue = $aReplace = array();
        foreach ($aMatch[1] as $sMatch) {
            // url value
            $urlvalue = $sMatch;
            tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
            $aValue[] = $sMatch;
            $aReplace[] = $urlvalue;
        }
        $content = str_replace($aValue, $aReplace, $content);
    }

    /**
     * Remove any backslashes, entities, and extraneous whitespace, and
     * run the keyword checks on that decoded copy. The decoded copy only
     * replaces the content when a check actually changed something.
     */
    $contentTemp = $content;
    tln_defang($contentTemp);
    tln_unspace($contentTemp);

    $match = array('/\/\*.*\*\//',
        '/expression/i',
        '/behaviou*r/i',
        '/binding/i',
        '/include-source/i',
        '/javascript/i',
        '/script/i',
        '/position/i');
    $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
    $contentNew = preg_replace($match, $replace, $contentTemp);
    if ($contentNew !== $contentTemp) {
        $content = $contentNew;
    }
    return array($content, $newpos);
}
790
/**
 * Converts the attributes of a <body> tag into attributes for the <div>
 * that replaces it.
 *
 * The div always gets class 'bodyclass'. The background, bgcolor and text
 * attributes are turned into an inline style; any background image is
 * replaced by $trans_image_path. All other attributes are dropped.
 *
 * @param array  $attary           map of body attribute name => quoted
 *                                 value.
 * @param string $trans_image_path URL used for the background image.
 * @return array map of div attribute name => quoted value.
 */
function tln_body2div($attary, $trans_image_path)
{
    $divattary = array('class' => "'bodyclass'");
    $text = '#000000';
    $has_bgc_stl = $has_txt_stl = false;
    $styledef = '';
    if (is_array($attary) && count($attary) > 0) {
        foreach ($attary as $attname => $attvalue) {
            // Strip the value's own quote character. Double quotes are
            // always stripped as well: the style attribute built below is
            // double-quoted, so a double quote surviving inside a
            // single-quoted value would terminate it early and let the
            // rest of the value be parsed as extra attributes.
            $quotchar = (string) substr($attvalue, 0, 1);
            $attvalue = str_replace(array($quotchar, '"'), '', $attvalue);
            switch ($attname) {
                case 'background':
                    $styledef .= "background-image: url('$trans_image_path'); ";
                    break;
                case 'bgcolor':
                    $has_bgc_stl = true;
                    $styledef .= "background-color: $attvalue; ";
                    break;
                case 'text':
                    $has_txt_stl = true;
                    $styledef .= "color: $attvalue; ";
                    break;
            }
        }
        // Outlook defines a white bgcolor and no text color. This can lead to
        // white text on a white bg with certain themes.
        if ($has_bgc_stl && !$has_txt_stl) {
            $styledef .= "color: $text; ";
        }
        if (strlen($styledef) > 0) {
            $divattary['style'] = "\"$styledef\"";
        }
    }
    return $divattary;
}
826
/**
 * Filters an HTML string according to the supplied rules and returns the
 * result wrapped in begin/end marker comments.
 *
 * @param string  $body                  the HTML you wish to filter.
 * @param array   $tag_list              first member is a boolean mode
 *                                       flag, the rest are tag names:
 *                                       with false the listed tags are
 *                                       removed, with true only the
 *                                       listed tags are kept.
 * @param array   $rm_tags_with_content  tags that are removed together
 *                                       with everything up to their
 *                                       closing tag.
 * @param array   $self_closing_tags     tags whose opening form is turned
 *                                       into the content-less form.
 * @param boolean $force_tag_closing     whether closing tags are appended
 *                                       for tags still open at the end.
 * @param array   $rm_attnames           map of tag-name regex => list of
 *                                       attribute-name regexes to remove.
 * @param array   $bad_attvals           map of tag-name regex => map of
 *                                       attribute-name regex =>
 *                                       array(patterns, replacements).
 * @param array   $add_attr_to_tag       map of tag-name regex => array of
 *                                       attributes to add to the tag.
 * @param string  $trans_image_path      replacement URL for rejected
 *                                       images.
 * @param boolean $block_external_images whether remote images are
 *                                       replaced.
 * @return string the sanitized html.
 */
function tln_sanitize(
    $body,
    $tag_list,
    $rm_tags_with_content,
    $self_closing_tags,
    $force_tag_closing,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    /**
     * Split the mode flag off $tag_list and normalize the tag lists.
     * $rm_tags says whether tag_list is of tags to remove or to allow:
     * false means remove these tags
     * true means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    $open_tags = array();
    $trusted = "<!-- begin tln_sanitized html -->\n";
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     * by escaping the leading ampersand. The entity is assembled from
     * two pieces so that tools which decode HTML entities in this source
     * cannot turn the replacement back into a no-op.
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&' . 'amp;\\1', $body);
    while (($curtag = tln_getnxtag($body, $curpos)) != false) {
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        $free_content = substr($body, $curpos, $lt - $curpos);
        /**
         * Take care of <style>
         */
        if ($tagname == "style" && $tagtype == 1) {
            // Emit the text that precedes the style tag first; it used to
            // be lost because the variable holding it was reused for the
            // style content.
            if ($skip_content == false) {
                $trusted .= $free_content;
            }
            list($style_content, $curpos) =
                tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
            if ($style_content != false) {
                if (!empty($attary)) {
                    $attary = tln_fixatts(
                        $tagname,
                        $attary,
                        $rm_attnames,
                        $bad_attvals,
                        $add_attr_to_tag,
                        $trans_image_path,
                        $block_external_images
                    );
                }
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                $trusted .= $style_content;
                $trusted .= tln_tagprint($tagname, null, 2);
            }
            continue;
        }
        if ($skip_content == false) {
            $trusted .= $free_content;
        }
        if ($tagname != false) {
            if ($tagtype == 2) {
                if ($skip_content == $tagname) {
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false) {
                        if ($tagname == "body") {
                            $tagname = "div";
                        }
                        // Only print closing tags that match a tag we
                        // actually opened.
                        if (isset($open_tags[$tagname]) &&
                            $open_tags[$tagname] > 0
                        ) {
                            $open_tags[$tagname]--;
                        } else {
                            $tagname = false;
                        }
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false) {
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)
                    ) {
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)
                    ) {
                        $skip_content = $tagname;
                    } else {
                        if (($rm_tags == false
                                && in_array($tagname, $tag_list)) ||
                            ($rm_tags == true
                                && !in_array($tagname, $tag_list))
                        ) {
                            $tagname = false;
                        } else {
                            /**
                             * Convert body into div.
                             */
                            if ($tagname == "body") {
                                $tagname = "div";
                                $attary = tln_body2div($attary, $trans_image_path);
                            }
                            if ($tagtype == 1) {
                                if (isset($open_tags[$tagname])) {
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && count($attary) > 0) {
                                $attary = tln_fixatts(
                                    $tagname,
                                    $attary,
                                    $rm_attnames,
                                    $bad_attvals,
                                    $add_attr_to_tag,
                                    $trans_image_path,
                                    $block_external_images
                                );
                            }
                        }
                    }
                }
            }
            if ($tagname != false && $skip_content == false) {
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true) {
        foreach ($open_tags as $tagname => $opentimes) {
            while ($opentimes > 0) {
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";
    return $trusted;
}
1007
/**
 * Filters HTML with the library's built-in rule set. A convenience
 * wrapper around tln_sanitize.
 *
 * @param string  $body                  the HTML to filter.
 * @param string  $trans_image_path      replacement URL used wherever an
 *                                       image or URL is rejected.
 * @param boolean $block_external_images when true, rules replacing
 *                                       http(s) image sources are added.
 * @return string the sanitized html.
 */
function HTMLFilter($body, $trans_image_path, $block_external_images = false)
{
    // Leading false: the tags listed here are removed (their content is
    // kept).
    $tag_list = array(
        false,
        "object",
        "meta",
        "html",
        "head",
        "base",
        "link",
        "frame",
        "iframe",
        "plaintext",
        "marquee"
    );

    // These tags are removed together with their content.
    $rm_tags_with_content = array(
        "script",
        "applet",
        "embed",
        "title",
        "frameset",
        "xmp",
        "xml"
    );

    $self_closing_tags = array(
        "img",
        "br",
        "hr",
        "input",
        "outbind"
    );

    $force_tag_closing = true;

    $rm_attnames = array(
        "/.*/" =>
            array(
                // "/target/i",
                "/^on.*/i",
                "/^dynsrc/i",
                "/^data.*/i",
                "/^lowsrc.*/i"
            )
    );

    $bad_attvals = array(
        "/.*/" =>
            array(
                "/^src|background/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2"
                        )
                    ),
                "/^href|action/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1#\\1",
                            "\\1#\\1",
                            "\\1#\\1"
                        )
                    ),
                "/^style/i" =>
                    array(
                        array(
                            "/\/\*.*\*\//",
                            "/expression/i",
                            "/binding/i",
                            "/behaviou*r/i",
                            "/include-source/i",
                            '/position\s*:/i',
                            '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                            '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                            '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                        ),
                        array(
                            "",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "url",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "\\1:url(\\2#\\3)"
                        )
                    )
            )
    );

    // Square brackets are required here: the curly-brace offset syntax
    // used previously no longer parses on PHP 8.
    if ($block_external_images) {
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][0],
            '/^([\'\"])\s*https*:.*([\'\"])/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][1],
            "\\1$trans_image_path\\1"
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][0],
            '/url\(([\'\"])\s*https*:.*([\'\"])\)/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][1],
            "url(\\1$trans_image_path\\1)"
        );
    }

    // Links always open in a new window.
    $add_attr_to_tag = array(
        "/^a$/i" =>
            array('target' => '"_blank"')
    );

    $trusted = tln_sanitize(
        $body,
        $tag_list,
        $rm_tags_with_content,
        $self_closing_tags,
        $force_tag_closing,
        $rm_attnames,
        $bad_attvals,
        $add_attr_to_tag,
        $trans_image_path,
        $block_external_images
    );
    return $trusted;
}
1160