[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/ -> kses.php (source)

   1  <?php
   2  
   3  # kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
   4  # Copyright (C) 2002, 2003, 2005  Ulf Harnhammar
   5  #
   6  # This program is free software and open source software; you can redistribute
   7  # it and/or modify it under the terms of the GNU General Public License as
   8  # published by the Free Software Foundation; either version 2 of the License,
   9  # or (at your option) any later version.
  10  #
  11  # This program is distributed in the hope that it will be useful, but WITHOUT
  12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14  # more details.
  15  #
  16  # You should have received a copy of the GNU General Public License along
  17  # with this program; if not, write to the Free Software Foundation, Inc.,
  18  # 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  or visit
  19  # http://www.gnu.org/licenses/gpl.html
  20  #
  21  # *** CONTACT INFORMATION ***
  22  #
  23  # E-mail:      metaur at users dot sourceforge dot net
  24  # Web page:    http://sourceforge.net/projects/kses
  25  # Paper mail:  Ulf Harnhammar
  26  #              Ymergatan 17 C
  27  #              753 25  Uppsala
  28  #              SWEDEN
  29  #
  30  # [kses strips evil scripts!]
  31  
  32  
  33  function kses($string, $allowed_html, $allowed_protocols =
  34                 array('http', 'https', 'ftp', 'news', 'nntp', 'telnet',
  35                       'gopher', 'mailto'))
  36  ###############################################################################
  37  # This function makes sure that only the allowed HTML element names, attribute
  38  # names and attribute values plus only sane HTML entities will occur in
  39  # $string. You have to remove any slashes from PHP's magic quotes before you
  40  # call this function.
  41  ###############################################################################
  42  {
  43    $string = kses_no_null($string);
  44    $string = kses_js_entities($string);
  45    $string = kses_normalize_entities($string);
  46    $string = kses_hook($string);
  47    $allowed_html_fixed = kses_array_lc($allowed_html);
  48    return kses_split($string, $allowed_html_fixed, $allowed_protocols);
  49  } # function kses
  50  
  51  
  52  function kses_hook($string)
  53  ###############################################################################
  54  # You add any kses hooks here.
  55  ###############################################################################
  56  {
  57    return $string;
  58  } # function kses_hook
  59  
  60  
  61  function kses_version()
  62  ###############################################################################
  63  # This function returns kses' version number.
  64  ###############################################################################
  65  {
  66    return '0.2.2';
  67  } # function kses_version
  68  
  69  
  70  function kses_split($string, $allowed_html, $allowed_protocols)
  71  ###############################################################################
  72  # This function searches for HTML tags, no matter how malformed. It also
  73  # matches stray ">" characters.
  74  ###############################################################################
  75  {
  76    return preg_replace('%(<'.   # EITHER: <
  77                        '[^>]*'. # things that aren't >
  78                        '(>|$)'. # > or end of string
  79                        '|>)%e', # OR: just a >
  80                        "kses_split2('\\1', \$allowed_html, ".
  81                        '$allowed_protocols)',
  82                        $string);
  83  } # function kses_split
  84  
  85  
  86  function kses_split2($string, $allowed_html, $allowed_protocols)
  87  ###############################################################################
  88  # This function does a lot of work. It rejects some very malformed things
  89  # like <:::>. It returns an empty string, if the element isn't allowed (look
  90  # ma, no strip_tags()!). Otherwise it splits the tag into an element and an
  91  # attribute list.
  92  ###############################################################################
  93  {
  94    $string = kses_stripslashes($string);
  95  
  96    if (substr($string, 0, 1) != '<')
  97      return '&gt;';
  98      # It matched a ">" character
  99  
 100    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches))
 101      return '';
 102      # It's seriously malformed
 103  
 104    $slash = trim($matches[1]);
 105    $elem = $matches[2];
 106    $attrlist = $matches[3];
 107  
 108    if (!@isset($allowed_html[strtolower($elem)]))
 109      return '';
 110      # They are using a not allowed HTML element
 111  
 112    if ($slash != '')
 113      return "<$slash$elem>";
 114    # No attributes are allowed for closing elements
 115  
 116    return kses_attr("$slash$elem", $attrlist, $allowed_html,
 117                     $allowed_protocols);
 118  } # function kses_split2
 119  
 120  
 121  function kses_attr($element, $attr, $allowed_html, $allowed_protocols)
 122  ###############################################################################
 123  # This function removes all attributes, if none are allowed for this element.
 124  # If some are allowed it calls kses_hair() to split them further, and then it
 125  # builds up new HTML code from the data that kses_hair() returns. It also
 126  # removes "<" and ">" characters, if there are any left. One more thing it
 127  # does is to check if the tag has a closing XHTML slash, and if it does,
 128  # it puts one in the returned code as well.
 129  ###############################################################################
 130  {
 131  # Is there a closing XHTML slash at the end of the attributes?
 132  
 133    $xhtml_slash = '';
 134    if (preg_match('%\s/\s*$%', $attr))
 135      $xhtml_slash = ' /';
 136  
 137  # Are any attributes allowed at all for this element?
 138  
 139    if (@count($allowed_html[strtolower($element)]) == 0)
 140      return "<$element$xhtml_slash>";
 141  
 142  # Split it
 143  
 144    $attrarr = kses_hair($attr, $allowed_protocols);
 145  
 146  # Go through $attrarr, and save the allowed attributes for this element
 147  # in $attr2
 148  
 149    $attr2 = '';
 150  
 151    foreach ($attrarr as $arreach)
 152    {
 153      if (!@isset($allowed_html[strtolower($element)]
 154                              [strtolower($arreach['name'])]))
 155        continue; # the attribute is not allowed
 156  
 157      $current = $allowed_html[strtolower($element)]
 158                              [strtolower($arreach['name'])];
 159  
 160      if (!is_array($current))
 161        $attr2 .= ' '.$arreach['whole'];
 162      # there are no checks
 163  
 164      else
 165      {
 166      # there are some checks
 167        $ok = true;
 168        foreach ($current as $currkey => $currval)
 169          if (!kses_check_attr_val($arreach['value'], $arreach['vless'],
 170                                   $currkey, $currval))
 171          { $ok = false; break; }
 172  
 173        if ($ok)
 174          $attr2 .= ' '.$arreach['whole']; # it passed them
 175      } # if !is_array($current)
 176    } # foreach
 177  
 178  # Remove any "<" or ">" characters
 179  
 180    $attr2 = preg_replace('/[<>]/', '', $attr2);
 181  
 182    return "<$element$attr2$xhtml_slash>";
 183  } # function kses_attr
 184  
 185  
 186  function kses_hair($attr, $allowed_protocols)
 187  ###############################################################################
 188  # This function does a lot of work. It parses an attribute list into an array
 189  # with attribute data, and tries to do the right thing even if it gets weird
 190  # input. It will add quotes around attribute values that don't have any quotes
 191  # or apostrophes around them, to make it easier to produce HTML code that will
 192  # conform to W3C's HTML specification. It will also remove bad URL protocols
 193  # from attribute values.
 194  ###############################################################################
 195  {
 196    $attrarr = array();
 197    $mode = 0;
 198    $attrname = '';
 199  
 200  # Loop through the whole attribute list
 201  
 202    while (strlen($attr) != 0)
 203    {
 204      $working = 0; # Was the last operation successful?
 205  
 206      switch ($mode)
 207      {
 208        case 0: # attribute name, href for instance
 209  
 210          if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
 211          {
 212            $attrname = $match[1];
 213            $working = $mode = 1;
 214            $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
 215          }
 216  
 217          break;
 218  
 219        case 1: # equals sign or valueless ("selected")
 220  
 221          if (preg_match('/^\s*=\s*/', $attr)) # equals sign
 222          {
 223            $working = 1; $mode = 2;
 224            $attr = preg_replace('/^\s*=\s*/', '', $attr);
 225            break;
 226          }
 227  
 228          if (preg_match('/^\s+/', $attr)) # valueless
 229          {
 230            $working = 1; $mode = 0;
 231            $attrarr[] = array
 232                          ('name'  => $attrname,
 233                           'value' => '',
 234                           'whole' => $attrname,
 235                           'vless' => 'y');
 236            $attr = preg_replace('/^\s+/', '', $attr);
 237          }
 238  
 239          break;
 240  
 241        case 2: # attribute value, a URL after href= for instance
 242  
 243          if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match))
 244           # "value"
 245          {
 246            $thisval = kses_bad_protocol($match[1], $allowed_protocols);
 247  
 248            $attrarr[] = array
 249                          ('name'  => $attrname,
 250                           'value' => $thisval,
 251                           'whole' => "$attrname=\"$thisval\"",
 252                           'vless' => 'n');
 253            $working = 1; $mode = 0;
 254            $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
 255            break;
 256          }
 257  
 258          if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match))
 259           # 'value'
 260          {
 261            $thisval = kses_bad_protocol($match[1], $allowed_protocols);
 262  
 263            $attrarr[] = array
 264                          ('name'  => $attrname,
 265                           'value' => $thisval,
 266                           'whole' => "$attrname='$thisval'",
 267                           'vless' => 'n');
 268            $working = 1; $mode = 0;
 269            $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
 270            break;
 271          }
 272  
 273          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match))
 274           # value
 275          {
 276            $thisval = kses_bad_protocol($match[1], $allowed_protocols);
 277  
 278            $attrarr[] = array
 279                          ('name'  => $attrname,
 280                           'value' => $thisval,
 281                           'whole' => "$attrname=\"$thisval\"",
 282                           'vless' => 'n');
 283                           # We add quotes to conform to W3C's HTML spec.
 284            $working = 1; $mode = 0;
 285            $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
 286          }
 287  
 288          break;
 289      } # switch
 290  
 291      if ($working == 0) # not well formed, remove and try again
 292      {
 293        $attr = kses_html_error($attr);
 294        $mode = 0;
 295      }
 296    } # while
 297  
 298    if ($mode == 1)
 299    # special case, for when the attribute list ends with a valueless
 300    # attribute like "selected"
 301      $attrarr[] = array
 302                    ('name'  => $attrname,
 303                     'value' => '',
 304                     'whole' => $attrname,
 305                     'vless' => 'y');
 306  
 307    return $attrarr;
 308  } # function kses_hair
 309  
 310  
 311  function kses_check_attr_val($value, $vless, $checkname, $checkvalue)
 312  ###############################################################################
 313  # This function performs different checks for attribute values. The currently
 314  # implemented checks are "maxlen", "minlen", "maxval", "minval" and "valueless"
 315  # with even more checks to come soon.
 316  ###############################################################################
 317  {
 318    $ok = true;
 319  
 320    switch (strtolower($checkname))
 321    {
 322      case 'maxlen':
 323      # The maxlen check makes sure that the attribute value has a length not
 324      # greater than the given value. This can be used to avoid Buffer Overflows
 325      # in WWW clients and various Internet servers.
 326  
 327        if (strlen($value) > $checkvalue)
 328          $ok = false;
 329        break;
 330  
 331      case 'minlen':
 332      # The minlen check makes sure that the attribute value has a length not
 333      # smaller than the given value.
 334  
 335        if (strlen($value) < $checkvalue)
 336          $ok = false;
 337        break;
 338  
 339      case 'maxval':
 340      # The maxval check does two things: it checks that the attribute value is
 341      # an integer from 0 and up, without an excessive amount of zeroes or
 342      # whitespace (to avoid Buffer Overflows). It also checks that the attribute
 343      # value is not greater than the given value.
 344      # This check can be used to avoid Denial of Service attacks.
 345  
 346        if (!preg_match('/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value))
 347          $ok = false;
 348        if ($value > $checkvalue)
 349          $ok = false;
 350        break;
 351  
 352      case 'minval':
 353      # The minval check checks that the attribute value is a positive integer,
 354      # and that it is not smaller than the given value.
 355  
 356        if (!preg_match('/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value))
 357          $ok = false;
 358        if ($value < $checkvalue)
 359          $ok = false;
 360        break;
 361  
 362      case 'valueless':
 363      # The valueless check checks if the attribute has a value
 364      # (like <a href="blah">) or not (<option selected>). If the given value
 365      # is a "y" or a "Y", the attribute must not have a value.
 366      # If the given value is an "n" or an "N", the attribute must have one.
 367  
 368        if (strtolower($checkvalue) != $vless)
 369          $ok = false;
 370        break;
 371    } # switch
 372  
 373    return $ok;
 374  } # function kses_check_attr_val
 375  
 376  
 377  function kses_bad_protocol($string, $allowed_protocols)
 378  ###############################################################################
 379  # This function removes all non-allowed protocols from the beginning of
 380  # $string. It ignores whitespace and the case of the letters, and it does
 381  # understand HTML entities. It does its work in a while loop, so it won't be
 382  # fooled by a string like "javascript:javascript:alert(57)".
 383  ###############################################################################
 384  {
 385    $string = kses_no_null($string);
 386    $string = preg_replace('/([^\xc3-\xcf])\xad+/', '\\1', $string); # deals with Opera "feature" -- moodle utf8 fix 
 387    $string2 = $string.'a';
 388  
 389    while ($string != $string2)
 390    {
 391      $string2 = $string;
 392      $string = kses_bad_protocol_once($string, $allowed_protocols);
 393    } # while
 394  
 395    return $string;
 396  } # function kses_bad_protocol
 397  
 398  
 399  function kses_no_null($string)
 400  ###############################################################################
 401  # This function removes any NULL characters in $string.
 402  ###############################################################################
 403  {
 404    $string = preg_replace('/\0+/', '', $string);
 405    $string = preg_replace('/(\\\\0)+/', '', $string);
 406  
 407    return $string;
 408  } # function kses_no_null
 409  
 410  
 411  function kses_stripslashes($string)
 412  ###############################################################################
 413  # This function changes the character sequence  \"  to just  "
 414  # It leaves all other slashes alone. It's really weird, but the quoting from
 415  # preg_replace(//e) seems to require this.
 416  ###############################################################################
 417  {
 418    return preg_replace('%\\\\"%', '"', $string);
 419  } # function kses_stripslashes
 420  
 421  
 422  function kses_array_lc($inarray)
 423  ###############################################################################
 424  # This function goes through an array, and changes the keys to all lower case.
 425  ###############################################################################
 426  {
 427    $outarray = array();
 428  
 429    foreach ($inarray as $inkey => $inval)
 430    {
 431      $outkey = strtolower($inkey);
 432      $outarray[$outkey] = array();
 433  
 434      foreach ($inval as $inkey2 => $inval2)
 435      {
 436        $outkey2 = strtolower($inkey2);
 437        $outarray[$outkey][$outkey2] = $inval2;
 438      } # foreach $inval
 439    } # foreach $inarray
 440  
 441    return $outarray;
 442  } # function kses_array_lc
 443  
 444  
 445  function kses_js_entities($string)
 446  ###############################################################################
 447  # This function removes the HTML JavaScript entities found in early versions of
 448  # Netscape 4.
 449  ###############################################################################
 450  {
 451    return preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
 452  } # function kses_js_entities
 453  
 454  
 455  function kses_html_error($string)
 456  ###############################################################################
 457  # This function deals with parsing errors in kses_hair(). The general plan is
 458  # to remove everything to and including some whitespace, but it deals with
 459  # quotes and apostrophes as well.
 460  ###############################################################################
 461  {
 462    return preg_replace('/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/', '', $string);
 463  } # function kses_html_error
 464  
 465  
 466  function kses_bad_protocol_once($string, $allowed_protocols)
 467  ###############################################################################
 468  # This function searches for URL protocols at the beginning of $string, while
 469  # handling whitespace and HTML entities.
 470  ###############################################################################
 471  {
 472    $string2 = preg_split('/:|&#58;|&#x3a;/i', $string, 2);
 473    if(isset($string2[1]) && !preg_match('%/\?%',$string2[0]))
 474    {
 475      $string = kses_bad_protocol_once2($string2[0],$allowed_protocols).trim($string2[1]);
 476    }
 477    return $string;
 478  } # function kses_bad_protocol_once
 479  
 480  
 481  function kses_bad_protocol_once2($string, $allowed_protocols)
 482  ###############################################################################
 483  # This function processes URL protocols, checks to see if they're in the white-
 484  # list or not, and returns different data depending on the answer.
 485  ###############################################################################
 486  {
 487    $string2 = kses_decode_entities($string);
 488    $string2 = preg_replace('/\s/', '', $string2);
 489    $string2 = kses_no_null($string2);
 490    $string2 = preg_replace('/\xad+/', '', $string2);
 491     # deals with Opera "feature"
 492    $string2 = strtolower($string2);
 493  
 494    $allowed = false;
 495    foreach ($allowed_protocols as $one_protocol)
 496      if (strtolower($one_protocol) == $string2)
 497      {
 498        $allowed = true;
 499        break;
 500      }
 501  
 502    if ($allowed)
 503      return "$string2:";
 504    else
 505      return '';
 506  } # function kses_bad_protocol_once2
 507  
 508  
 509  function kses_normalize_entities($string)
 510  ###############################################################################
 511  # This function normalizes HTML entities. It will convert "AT&T" to the correct
 512  # "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
 513  ###############################################################################
 514  {
 515  # Disarm all entities by converting & to &amp;
 516  
 517    $string = str_replace('&', '&amp;', $string);
 518  
 519  # Change back the allowed entities in our entity whitelist
 520  
 521    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/',
 522                           '&\\1;', $string);
 523    $string = preg_replace('/&amp;#0*([0-9]{1,5});/e',
 524                           'kses_normalize_entities2("\\1")', $string);
 525    $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/',
 526                           '&#\\1\\2;', $string);
 527  
 528    return $string;
 529  } # function kses_normalize_entities
 530  
 531  
 532  function kses_normalize_entities2($i)
 533  ###############################################################################
 534  # This function helps kses_normalize_entities() to only accept 16 bit values
 535  # and nothing more for &#number; entities.
 536  ###############################################################################
 537  {
 538    return (($i > 65535) ? "&amp;#$i;" : "&#$i;");
 539  } # function kses_normalize_entities2
 540  
 541  
 542  function kses_decode_entities($string)
 543  ###############################################################################
 544  # This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
 545  # do anything with other entities like &auml;, but we don't need them in the
 546  # URL protocol whitelisting system anyway.
 547  ###############################################################################
 548  {
 549    $string = preg_replace('/&#([0-9]+);/e', 'chr("\\1")', $string);
 550    $string = preg_replace('/&#[Xx]([0-9A-Fa-f]+);/e', 'chr(hexdec("\\1"))',
 551                           $string);
 552  
 553    return $string;
 554  } # function kses_decode_entities
 555  
 556  ?>


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7