[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/htmlpurifier/HTMLPurifier/Lexer/ -> DirectLex.php (source)

   1  <?php
   2  
   3  require_once  'HTMLPurifier/Lexer.php';
   4  
   5  HTMLPurifier_ConfigSchema::define(
   6      'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
   7  <p>
   8    Specifies the number of tokens the DirectLex line number tracking
   9    implementations should process before attempting to resyncronize the
  10    current line count by manually counting all previous new-lines. When
  11    at 0, this functionality is disabled. Lower values will decrease
  12    performance, and this is only strictly necessary if the counting
  13    algorithm is buggy (in which case you should report it as a bug).
  14    This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
  15    not being used. This directive has been available since 2.0.0.
  16  </p>
  17  ');
  18  
  19  /**
  20   * Our in-house implementation of a parser.
  21   * 
  22   * A pure PHP parser, DirectLex has absolutely no dependencies, making
  23   * it a reasonably good default for PHP4.  Written with efficiency in mind,
  24   * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  25   * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  26   * 
  27   * @todo Reread XML spec and document differences.
  28   */
  29  class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  30  {
  31      
  32      /**
  33       * Whitespace characters for str(c)spn.
  34       * @protected
  35       */
  36      var $_whitespace = "\x20\x09\x0D\x0A";
  37      
  38      /**
  39       * Callback function for script CDATA fudge
  40       * @param $matches, in form of array(opening tag, contents, closing tag)
  41       * @static
  42       */
  43      function scriptCallback($matches) {
  44          return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  45      }
  46      
  47      function tokenizeHTML($html, $config, &$context) {
  48          
  49          // special normalization for script tags without any armor
  50          // our "armor" heurstic is a < sign any number of whitespaces after
  51          // the first script tag
  52          if ($config->get('HTML', 'Trusted')) {
  53              $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  54                  array('HTMLPurifier_Lexer_DirectLex', 'scriptCallback'), $html);
  55          }
  56          
  57          $html = $this->normalize($html, $config, $context);
  58          
  59          $cursor = 0; // our location in the text
  60          $inside_tag = false; // whether or not we're parsing the inside of a tag
  61          $array = array(); // result array
  62          
  63          $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
  64          
  65          if ($maintain_line_numbers === null) {
  66              // automatically determine line numbering by checking
  67              // if error collection is on
  68              $maintain_line_numbers = $config->get('Core', 'CollectErrors');
  69          }
  70          
  71          if ($maintain_line_numbers) $current_line = 1;
  72          else $current_line = false;
  73          $context->register('CurrentLine', $current_line);
  74          $nl = "\n";
  75          // how often to manually recalculate. This will ALWAYS be right,
  76          // but it's pretty wasteful. Set to 0 to turn off
  77          $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); 
  78          
  79          $e = false;
  80          if ($config->get('Core', 'CollectErrors')) {
  81              $e =& $context->get('ErrorCollector');
  82          }
  83          
  84          // infinite loop protection
  85          // has to be pretty big, since html docs can be big
  86          // we're allow two hundred thousand tags... more than enough?
  87          // NOTE: this is also used for synchronization, so watch out
  88          $loops = 0;
  89          
  90          while(true) {
  91              
  92              // infinite loop protection
  93              if (++$loops > 200000) return array();
  94              
  95              // recalculate lines
  96              if (
  97                  $maintain_line_numbers && // line number tracking is on
  98                  $synchronize_interval &&  // synchronization is on
  99                  $cursor > 0 &&            // cursor is further than zero
 100                  $loops % $synchronize_interval === 0 // time to synchronize!
 101              ) {
 102                  $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 103              }
 104              
 105              $position_next_lt = strpos($html, '<', $cursor);
 106              $position_next_gt = strpos($html, '>', $cursor);
 107              
 108              // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 109              // special case to set up context
 110              if ($position_next_lt === $cursor) {
 111                  $inside_tag = true;
 112                  $cursor++;
 113              }
 114              
 115              if (!$inside_tag && $position_next_lt !== false) {
 116                  // We are not inside tag and there still is another tag to parse
 117                  $token = new
 118                      HTMLPurifier_Token_Text(
 119                          $this->parseData(
 120                              substr(
 121                                  $html, $cursor, $position_next_lt - $cursor
 122                              )
 123                          )
 124                      );
 125                  if ($maintain_line_numbers) {
 126                      $token->line = $current_line;
 127                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 128                  }
 129                  $array[] = $token;
 130                  $cursor  = $position_next_lt + 1;
 131                  $inside_tag = true;
 132                  continue;
 133              } elseif (!$inside_tag) {
 134                  // We are not inside tag but there are no more tags
 135                  // If we're already at the end, break
 136                  if ($cursor === strlen($html)) break;
 137                  // Create Text of rest of string
 138                  $token = new
 139                      HTMLPurifier_Token_Text(
 140                          $this->parseData(
 141                              substr(
 142                                  $html, $cursor
 143                              )
 144                          )
 145                      );
 146                  if ($maintain_line_numbers) $token->line = $current_line;
 147                  $array[] = $token;
 148                  break;
 149              } elseif ($inside_tag && $position_next_gt !== false) {
 150                  // We are in tag and it is well formed
 151                  // Grab the internals of the tag
 152                  $strlen_segment = $position_next_gt - $cursor;
 153                  
 154                  if ($strlen_segment < 1) {
 155                      // there's nothing to process!
 156                      $token = new HTMLPurifier_Token_Text('<');
 157                      $cursor++;
 158                      continue;
 159                  }
 160                  
 161                  $segment = substr($html, $cursor, $strlen_segment);
 162                  
 163                  if ($segment === false) {
 164                      // somehow, we attempted to access beyond the end of
 165                      // the string, defense-in-depth, reported by Nate Abele
 166                      break;
 167                  }
 168                  
 169                  // Check if it's a comment
 170                  if (
 171                      strncmp('!--', $segment, 3) === 0
 172                  ) {
 173                      // re-determine segment length, looking for -->
 174                      $position_comment_end = strpos($html, '-->', $cursor);
 175                      if ($position_comment_end === false) {
 176                          // uh oh, we have a comment that extends to
 177                          // infinity. Can't be helped: set comment
 178                          // end position to end of string
 179                          if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
 180                          $position_comment_end = strlen($html);
 181                          $end = true;
 182                      } else {
 183                          $end = false;
 184                      }
 185                      $strlen_segment = $position_comment_end - $cursor;
 186                      $segment = substr($html, $cursor, $strlen_segment);
 187                      $token = new HTMLPurifier_Token_Comment(substr($segment, 3));
 188                      if ($maintain_line_numbers) {
 189                          $token->line = $current_line;
 190                          $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 191                      }
 192                      $array[] = $token;
 193                      $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 194                      $inside_tag = false;
 195                      continue;
 196                  }
 197                  
 198                  // Check if it's an end tag
 199                  $is_end_tag = (strpos($segment,'/') === 0);
 200                  if ($is_end_tag) {
 201                      $type = substr($segment, 1);
 202                      $token = new HTMLPurifier_Token_End($type);
 203                      if ($maintain_line_numbers) {
 204                          $token->line = $current_line;
 205                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 206                      }
 207                      $array[] = $token;
 208                      $inside_tag = false;
 209                      $cursor = $position_next_gt + 1;
 210                      continue;
 211                  }
 212                  
 213                  // Check leading character is alnum, if not, we may
 214                  // have accidently grabbed an emoticon. Translate into
 215                  // text and go our merry way
 216                  if (!ctype_alpha($segment[0])) {
 217                      // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 218                      if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 219                      $token = new
 220                          HTMLPurifier_Token_Text(
 221                              '<' .
 222                              $this->parseData(
 223                                  $segment
 224                              ) . 
 225                              '>'
 226                          );
 227                      if ($maintain_line_numbers) {
 228                          $token->line = $current_line;
 229                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 230                      }
 231                      $array[] = $token;
 232                      $cursor = $position_next_gt + 1;
 233                      $inside_tag = false;
 234                      continue;
 235                  }
 236                  
 237                  // Check if it is explicitly self closing, if so, remove
 238                  // trailing slash. Remember, we could have a tag like <br>, so
 239                  // any later token processing scripts must convert improperly
 240                  // classified EmptyTags from StartTags.
 241                  $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
 242                  if ($is_self_closing) {
 243                      $strlen_segment--;
 244                      $segment = substr($segment, 0, $strlen_segment);
 245                  }
 246                  
 247                  // Check if there are any attributes
 248                  $position_first_space = strcspn($segment, $this->_whitespace);
 249                  
 250                  if ($position_first_space >= $strlen_segment) {
 251                      if ($is_self_closing) {
 252                          $token = new HTMLPurifier_Token_Empty($segment);
 253                      } else {
 254                          $token = new HTMLPurifier_Token_Start($segment);
 255                      }
 256                      if ($maintain_line_numbers) {
 257                          $token->line = $current_line;
 258                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 259                      }
 260                      $array[] = $token;
 261                      $inside_tag = false;
 262                      $cursor = $position_next_gt + 1;
 263                      continue;
 264                  }
 265                  
 266                  // Grab out all the data
 267                  $type = substr($segment, 0, $position_first_space);
 268                  $attribute_string =
 269                      trim(
 270                          substr(
 271                              $segment, $position_first_space
 272                          )
 273                      );
 274                  if ($attribute_string) {
 275                      $attr = $this->parseAttributeString(
 276                                      $attribute_string
 277                                    , $config, $context
 278                                );
 279                  } else {
 280                      $attr = array();
 281                  }
 282                  
 283                  if ($is_self_closing) {
 284                      $token = new HTMLPurifier_Token_Empty($type, $attr);
 285                  } else {
 286                      $token = new HTMLPurifier_Token_Start($type, $attr);
 287                  }
 288                  if ($maintain_line_numbers) {
 289                      $token->line = $current_line;
 290                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 291                  }
 292                  $array[] = $token;
 293                  $cursor = $position_next_gt + 1;
 294                  $inside_tag = false;
 295                  continue;
 296              } else {
 297                  // inside tag, but there's no ending > sign
 298                  if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
 299                  $token = new
 300                      HTMLPurifier_Token_Text(
 301                          '<' .
 302                          $this->parseData(
 303                              substr($html, $cursor)
 304                          )
 305                      );
 306                  if ($maintain_line_numbers) $token->line = $current_line;
 307                  // no cursor scroll? Hmm...
 308                  $array[] = $token;
 309                  break;
 310              }
 311              break;
 312          }
 313          
 314          $context->destroy('CurrentLine');
 315          return $array;
 316      }
 317      
 318      /**
 319       * PHP 4 compatible substr_count that implements offset and length
 320       */
 321      function substrCount($haystack, $needle, $offset, $length) {
 322          static $oldVersion;
 323          if ($oldVersion === null) {
 324              $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 325          }
 326          if ($oldVersion) {
 327              $haystack = substr($haystack, $offset, $length);
 328              return substr_count($haystack, $needle);
 329          } else {
 330              return substr_count($haystack, $needle, $offset, $length);
 331          }
 332      }
 333      
 334      /**
 335       * Takes the inside of an HTML tag and makes an assoc array of attributes.
 336       * 
 337       * @param $string Inside of tag excluding name.
 338       * @returns Assoc array of attributes.
 339       */
 340      function parseAttributeString($string, $config, &$context) {
 341          $string = (string) $string; // quick typecast
 342          
 343          if ($string == '') return array(); // no attributes
 344          
 345          $e = false;
 346          if ($config->get('Core', 'CollectErrors')) {
 347              $e =& $context->get('ErrorCollector');
 348          }
 349          
 350          // let's see if we can abort as quickly as possible
 351          // one equal sign, no spaces => one attribute
 352          $num_equal = substr_count($string, '=');
 353          $has_space = strpos($string, ' ');
 354          if ($num_equal === 0 && !$has_space) {
 355              // bool attribute
 356              return array($string => $string);
 357          } elseif ($num_equal === 1 && !$has_space) {
 358              // only one attribute
 359              list($key, $quoted_value) = explode('=', $string);
 360              $quoted_value = trim($quoted_value);
 361              if (!$key) {
 362                  if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 363                  return array();
 364              }
 365              if (!$quoted_value) return array($key => '');
 366              $first_char = @$quoted_value[0];
 367              $last_char  = @$quoted_value[strlen($quoted_value)-1];
 368              
 369              $same_quote = ($first_char == $last_char);
 370              $open_quote = ($first_char == '"' || $first_char == "'");
 371              
 372              if ( $same_quote && $open_quote) {
 373                  // well behaved
 374                  $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 375              } else {
 376                  // not well behaved
 377                  if ($open_quote) {
 378                      if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
 379                      $value = substr($quoted_value, 1);
 380                  } else {
 381                      $value = $quoted_value;
 382                  }
 383              }
 384              if ($value === false) $value = '';
 385              return array($key => $value);
 386          }
 387          
 388          // setup loop environment
 389          $array  = array(); // return assoc array of attributes
 390          $cursor = 0; // current position in string (moves forward)
 391          $size   = strlen($string); // size of the string (stays the same)
 392          
 393          // if we have unquoted attributes, the parser expects a terminating
 394          // space, so let's guarantee that there's always a terminating space.
 395          $string .= ' ';
 396          
 397          // infinite loop protection
 398          $loops = 0;
 399          while(true) {
 400              
 401              // infinite loop protection
 402              if (++$loops > 1000) {
 403                  trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
 404                  return array();
 405              }
 406              
 407              if ($cursor >= $size) {
 408                  break;
 409              }
 410              
 411              $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 412              // grab the key
 413              
 414              $key_begin = $cursor; //we're currently at the start of the key
 415              
 416              // scroll past all characters that are the key (not whitespace or =)
 417              $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 418              
 419              $key_end = $cursor; // now at the end of the key
 420              
 421              $key = substr($string, $key_begin, $key_end - $key_begin);
 422              
 423              if (!$key) {
 424                  if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 425                  $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 426                  continue; // empty key
 427              }
 428              
 429              // scroll past all whitespace
 430              $cursor += strspn($string, $this->_whitespace, $cursor);
 431              
 432              if ($cursor >= $size) {
 433                  $array[$key] = $key;
 434                  break;
 435              }
 436              
 437              // if the next character is an equal sign, we've got a regular
 438              // pair, otherwise, it's a bool attribute
 439              $first_char = @$string[$cursor];
 440              
 441              if ($first_char == '=') {
 442                  // key="value"
 443                  
 444                  $cursor++;
 445                  $cursor += strspn($string, $this->_whitespace, $cursor);
 446                  
 447                  if ($cursor === false) {
 448                      $array[$key] = '';
 449                      break;
 450                  }
 451                  
 452                  // we might be in front of a quote right now
 453                  
 454                  $char = @$string[$cursor];
 455                  
 456                  if ($char == '"' || $char == "'") {
 457                      // it's quoted, end bound is $char
 458                      $cursor++;
 459                      $value_begin = $cursor;
 460                      $cursor = strpos($string, $char, $cursor);
 461                      $value_end = $cursor;
 462                  } else {
 463                      // it's not quoted, end bound is whitespace
 464                      $value_begin = $cursor;
 465                      $cursor += strcspn($string, $this->_whitespace, $cursor);
 466                      $value_end = $cursor;
 467                  }
 468                  
 469                  // we reached a premature end
 470                  if ($cursor === false) {
 471                      $cursor = $size;
 472                      $value_end = $cursor;
 473                  }
 474                  
 475                  $value = substr($string, $value_begin, $value_end - $value_begin);
 476                  if ($value === false) $value = '';
 477                  $array[$key] = $this->parseData($value);
 478                  $cursor++;
 479                  
 480              } else {
 481                  // boolattr
 482                  if ($key !== '') {
 483                      $array[$key] = $key;
 484                  } else {
 485                      // purely theoretical
 486                      if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 487                  }
 488                  
 489              }
 490          }
 491          return $array;
 492      }
 493      
 494  }
 495  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7