| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php

require_once 'HTMLPurifier/Lexer.php';

HTMLPurifier_ConfigSchema::define(
    'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
<p>
  Specifies the number of tokens the DirectLex line number tracking
  implementations should process before attempting to resyncronize the
  current line count by manually counting all previous new-lines. When
  at 0, this functionality is disabled. Lower values will decrease
  performance, and this is only strictly necessary if the counting
  algorithm is buggy (in which case you should report it as a bug).
  This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
  not being used. This directive has been available since 2.0.0.
</p>
');

/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * The class is deliberately kept PHP 4 compatible (var properties, array(),
 * reference assignment), matching the rest of this file.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    /**
     * Whitespace characters for str(c)spn: space, tab, CR and LF.
     * @protected
     */
    var $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: escapes the contents of an
     * unarmored script element so it survives tokenization as plain text.
     *
     * @param $matches, in form of array(full match, opening tag, contents, closing tag)
     * @return Opening tag, HTML-escaped contents and closing tag, concatenated.
     * @static
     */
    function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of tokens (text, comment,
     * start, end and empty tags).
     *
     * When line number tracking is enabled, 'CurrentLine' is registered in
     * $context for the duration of the call, every token gets a ->line
     * property, and the registration is removed again before returning.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object; read for HTML.Trusted,
     *                 Core.MaintainLineNumbers, Core.CollectErrors and
     *                 Core.DirectLexLineNumberSyncInterval.
     * @param $context Context object (by reference); supplies the
     *                 'ErrorCollector' when error collection is on.
     * @return Array of HTMLPurifier_Token_* objects; an empty array if the
     *         loop-protection limit was hit.
     */
    function tokenizeHTML($html, $config, &$context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML', 'Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array('HTMLPurifier_Lexer_DirectLex', 'scriptCallback'), $html);
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core', 'CollectErrors');
        }

        if ($maintain_line_numbers) $current_line = 1;
        else $current_line = false;
        $context->register('CurrentLine', $current_line);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // infinite loop protection
        // has to be pretty big, since html docs can be big
        // we allow two hundred thousand iterations... more than enough?
        // NOTE: this is also used for synchronization, so watch out
        $loops = 0;

        while (true) {

            // infinite loop protection
            if (++$loops > 200000) {
                // unregister before bailing out, otherwise 'CurrentLine'
                // stays registered in the caller's context
                $context->destroy('CurrentLine');
                return array();
            }

            // recalculate lines
            if (
                $maintain_line_numbers && // line number tracking is on
                $synchronize_interval &&  // synchronization is on
                $cursor > 0 &&            // cursor is further than zero
                $loops % $synchronize_interval === 0 // time to synchronize!
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor, $position_next_lt - $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) $token->line = $current_line;
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // "<>": there is no tag name to process. Emit the two
                    // characters as literal text and leave tag mode, so
                    // whatever follows is not mistaken for a tag name.
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<>');
                    if ($maintain_line_numbers) $token->line = $current_line;
                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (
                    strncmp('!--', $segment, 3) === 0
                ) {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // drop the leading "!--" from the comment body
                    $token = new HTMLPurifier_Token_Comment(substr($segment, 3));
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only if there actually was one
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alpha, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new
                        HTMLPurifier_Token_Text(
                            '<' .
                            $this->parseData(
                                $segment
                            ) .
                            '>'
                        );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace in the segment: it is just a tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string =
                    trim(
                        substr(
                            $segment, $position_first_space
                        )
                    );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                                    $attribute_string
                                  , $config, $context
                              );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
                if ($maintain_line_numbers) $token->line = $current_line;
                // no cursor scroll needed: the rest of the string was consumed
                $array[] = $token;
                break;
            }
        }

        $context->destroy('CurrentLine');
        return $array;
    }

    /**
     * PHP 4 compatible substr_count that implements offset and length.
     *
     * On PHP older than 5.1 the haystack is cut down with substr() first;
     * otherwise the native offset/length arguments are used.
     *
     * @param $haystack String to search in.
     * @param $needle   Substring to count.
     * @param $offset   Position in $haystack to start counting from.
     * @param $length   Number of characters after $offset to consider.
     * @return Number of occurrences of $needle in the selected range.
     */
    function substrCount($haystack, $needle, $offset, $length) {
        static $oldVersion;
        if ($oldVersion === null) {
            // computed once per request and cached in the static
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Values are passed through parseData() on every path, so the result
     * does not depend on whether the tag carried one attribute or several.
     * Boolean attributes (no "=") map to their own name.
     *
     * @param $string  Inside of tag excluding name.
     * @param $config  Configuration object; read for Core.CollectErrors.
     * @param $context Context object (by reference); supplies the
     *                 'ErrorCollector' when error collection is on.
     * @returns Assoc array of attributes; an empty array when there are
     *          none, when the only attribute has no key, or when the
     *          loop-protection limit was hit.
     */
    function parseAttributeString($string, $config, &$context) {
        $string = (string) $string; // quick typecast

        if ($string == '') return array(); // no attributes

        $e = false;
        if ($config->get('Core', 'CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // at most one equal sign, no whitespace => one attribute.
        // Every separator in $_whitespace counts, not just the space
        // character, and whitespace at position 0 counts too.
        $num_equal = substr_count($string, '=');
        $has_whitespace = (strcspn($string, $this->_whitespace) < strlen($string));
        if ($num_equal === 0 && !$has_whitespace) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_whitespace) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            // compare against '' explicitly: a bare "0" (as in border=0)
            // is a real value, not a missing one
            if ($quoted_value === '') return array($key => '');
            $first_char = $quoted_value[0];
            $last_char  = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            // substr() may return false for an empty result on older PHP
            if ($value === false) $value = '';
            // same post-processing as the general loop below
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        // infinite loop protection
        $loops = 0;
        while (true) {

            // infinite loop protection
            if (++$loops > 1000) {
                trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
                return array();
            }

            if ($cursor >= $size) {
                break;
            }

            // skip leading whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // only trailing whitespace was left: nothing more to parse
            if ($cursor >= $size) {
                break;
            }

            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                // step over the stray character (the 1) plus everything up
                // to the next whitespace, so the cursor always advances
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // key was the last thing in the string: bool attribute
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor >= $size) {
                    // "key=" with nothing after it
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end (closing quote not found)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                // step past the closing quote / terminating whitespace
                $cursor++;

            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }

            }
        }
        return $array;
    }

}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |