| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php

require_once 'HTMLPurifier/Lexer/DOMLex.php';

/**
 * Experimental lexer that parses HTML with Jeroen van der Meer's PH5P
 * library (an HTML5-based parser) rather than the parent DOM lexer's loader.
 *
 * Requires PHP5. The parser lives in the HTML5 pseudo-namespace, so it may
 * conflict with other code that declares classes under the same names.
 */

class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes an HTML string by running it through the PH5P parser and
     * walking the resulting DOM.
     *
     * @param string $html    Markup to tokenize.
     * @param mixed  $config  Configuration, forwarded untouched to normalize() and wrapHTML().
     * @param mixed  $context Context, forwarded untouched to normalize() and wrapHTML().
     * @return array The token list collected by tokenizeDOM().
     */
    public function tokenizeHTML($html, $config, &$context) {
        // Pre-process the markup with the inherited helpers before parsing.
        $normalized = $this->normalize($html, $config, $context);
        $wrapped    = $this->wrapHTML($normalized, $config, $context);

        // Constructing the parser consumes the whole input; save() returns the document.
        $parser   = new HTML5($wrapped);
        $document = $parser->save();

        // Descend <html> -> <body> -> first <div>; that <div> is the node that gets
        // tokenized (presumably the wrapper added by wrapHTML() -- confirm in DOMLex).
        $htmlNode = $document->getElementsByTagName('html')->item(0);
        $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
        $divNode  = $bodyNode->getElementsByTagName('div')->item(0);

        // NOTE(review): tokenizeDOM() is expected to fill $tokens in place
        // (by-reference parameter in the parent class -- confirm).
        $tokens = array();
        $this->tokenizeDOM($divNode, $tokens);

        return $tokens;
    }

}

/*

Copyright 2007 Jeroen van der Meer <http://jero.net/>

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
51 52 */ 53 54 class HTML5 { 55 private $data; 56 private $char; 57 private $EOF; 58 private $state; 59 private $tree; 60 private $token; 61 private $content_model; 62 private $escape = false; 63 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute', 64 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;', 65 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;', 66 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;', 67 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;', 68 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;', 69 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;', 70 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;', 71 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;', 72 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN', 73 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;', 74 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;', 75 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig', 76 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;', 77 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;', 78 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil', 79 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;', 80 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;', 81 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;', 82 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth', 83 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12', 84 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt', 85 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc', 86 
'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;', 87 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;', 88 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;', 89 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro', 90 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;', 91 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;', 92 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;', 93 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash', 94 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;', 95 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;', 96 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;', 97 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;', 98 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;', 99 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;', 100 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;', 101 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;', 102 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc', 103 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;', 104 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;'); 105 106 const PCDATA = 0; 107 const RCDATA = 1; 108 const CDATA = 2; 109 const PLAINTEXT = 3; 110 111 const DOCTYPE = 0; 112 const STARTTAG = 1; 113 const ENDTAG = 2; 114 const COMMENT = 3; 115 const CHARACTR = 4; 116 const EOF = 5; 117 118 public function __construct($data) { 119 $data = str_replace("\r\n", "\n", $data); 120 $date = str_replace("\r", null, $data); 121 122 $this->data = $data; 123 $this->char = -1; 124 $this->EOF = strlen($data); 125 $this->tree = new HTML5TreeConstructer; 126 
$this->content_model = self::PCDATA; 127 128 $this->state = 'data'; 129 130 while($this->state !== null) { 131 $this->{$this->state.'State'}(); 132 } 133 } 134 135 public function save() { 136 return $this->tree->save(); 137 } 138 139 private function char() { 140 return ($this->char < $this->EOF) 141 ? $this->data[$this->char] 142 : false; 143 } 144 145 private function character($s, $l = 0) { 146 if($s + $l < $this->EOF) { 147 if($l === 0) { 148 return $this->data[$s]; 149 } else { 150 return substr($this->data, $s, $l); 151 } 152 } 153 } 154 155 private function characters($char_class, $start) { 156 return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start)); 157 } 158 159 private function dataState() { 160 // Consume the next input character 161 $this->char++; 162 $char = $this->char(); 163 164 if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { 165 /* U+0026 AMPERSAND (&) 166 When the content model flag is set to one of the PCDATA or RCDATA 167 states: switch to the entity data state. Otherwise: treat it as per 168 the "anything else" entry below. */ 169 $this->state = 'entityData'; 170 171 } elseif($char === '-') { 172 /* If the content model flag is set to either the RCDATA state or 173 the CDATA state, and the escape flag is false, and there are at 174 least three characters before this one in the input stream, and the 175 last four characters in the input stream, including this one, are 176 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, 177 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */ 178 if(($this->content_model === self::RCDATA || $this->content_model === 179 self::CDATA) && $this->escape === false && 180 $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') { 181 $this->escape = true; 182 } 183 184 /* In any case, emit the input character as a character token. Stay 185 in the data state. 
*/ 186 $this->emitToken(array( 187 'type' => self::CHARACTR, 188 'data' => $char 189 )); 190 191 /* U+003C LESS-THAN SIGN (<) */ 192 } elseif($char === '<' && ($this->content_model === self::PCDATA || 193 (($this->content_model === self::RCDATA || 194 $this->content_model === self::CDATA) && $this->escape === false))) { 195 /* When the content model flag is set to the PCDATA state: switch 196 to the tag open state. 197 198 When the content model flag is set to either the RCDATA state or 199 the CDATA state and the escape flag is false: switch to the tag 200 open state. 201 202 Otherwise: treat it as per the "anything else" entry below. */ 203 $this->state = 'tagOpen'; 204 205 /* U+003E GREATER-THAN SIGN (>) */ 206 } elseif($char === '>') { 207 /* If the content model flag is set to either the RCDATA state or 208 the CDATA state, and the escape flag is true, and the last three 209 characters in the input stream including this one are U+002D 210 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), 211 set the escape flag to false. */ 212 if(($this->content_model === self::RCDATA || 213 $this->content_model === self::CDATA) && $this->escape === true && 214 $this->character($this->char, 3) === '-->') { 215 $this->escape = false; 216 } 217 218 /* In any case, emit the input character as a character token. 219 Stay in the data state. */ 220 $this->emitToken(array( 221 'type' => self::CHARACTR, 222 'data' => $char 223 )); 224 225 } elseif($this->char === $this->EOF) { 226 /* EOF 227 Emit an end-of-file token. */ 228 $this->EOF(); 229 230 } elseif($this->content_model === self::PLAINTEXT) { 231 /* When the content model flag is set to the PLAINTEXT state 232 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of 233 the text and emit it as a character token. 
*/ 234 $this->emitToken(array( 235 'type' => self::CHARACTR, 236 'data' => substr($this->data, $this->char) 237 )); 238 239 $this->EOF(); 240 241 } else { 242 /* Anything else 243 THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that 244 otherwise would also be treated as a character token and emit it 245 as a single character token. Stay in the data state. */ 246 $len = strcspn($this->data, '<&', $this->char); 247 $char = substr($this->data, $this->char, $len); 248 $this->char += $len - 1; 249 250 $this->emitToken(array( 251 'type' => self::CHARACTR, 252 'data' => $char 253 )); 254 255 $this->state = 'data'; 256 } 257 } 258 259 private function entityDataState() { 260 // Attempt to consume an entity. 261 $entity = $this->entity(); 262 263 // If nothing is returned, emit a U+0026 AMPERSAND character token. 264 // Otherwise, emit the character token that was returned. 265 $char = (!$entity) ? '&' : $entity; 266 $this->emitToken(array( 267 'type' => self::CHARACTR, 268 'data' => $char 269 )); 270 271 // Finally, switch to the data state. 272 $this->state = 'data'; 273 } 274 275 private function tagOpenState() { 276 switch($this->content_model) { 277 case self::RCDATA: 278 case self::CDATA: 279 /* If the next input character is a U+002F SOLIDUS (/) character, 280 consume it and switch to the close tag open state. If the next 281 input character is not a U+002F SOLIDUS (/) character, emit a 282 U+003C LESS-THAN SIGN character token and switch to the data 283 state to process the next input character. 
*/ 284 if($this->character($this->char + 1) === '/') { 285 $this->char++; 286 $this->state = 'closeTagOpen'; 287 288 } else { 289 $this->emitToken(array( 290 'type' => self::CHARACTR, 291 'data' => '<' 292 )); 293 294 $this->state = 'data'; 295 } 296 break; 297 298 case self::PCDATA: 299 // If the content model flag is set to the PCDATA state 300 // Consume the next input character: 301 $this->char++; 302 $char = $this->char(); 303 304 if($char === '!') { 305 /* U+0021 EXCLAMATION MARK (!) 306 Switch to the markup declaration open state. */ 307 $this->state = 'markupDeclarationOpen'; 308 309 } elseif($char === '/') { 310 /* U+002F SOLIDUS (/) 311 Switch to the close tag open state. */ 312 $this->state = 'closeTagOpen'; 313 314 } elseif(preg_match('/^[A-Za-z]$/', $char)) { 315 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 316 Create a new start tag token, set its tag name to the lowercase 317 version of the input character (add 0x0020 to the character's code 318 point), then switch to the tag name state. (Don't emit the token 319 yet; further details will be filled in before it is emitted.) */ 320 $this->token = array( 321 'name' => strtolower($char), 322 'type' => self::STARTTAG, 323 'attr' => array() 324 ); 325 326 $this->state = 'tagName'; 327 328 } elseif($char === '>') { 329 /* U+003E GREATER-THAN SIGN (>) 330 Parse error. Emit a U+003C LESS-THAN SIGN character token and a 331 U+003E GREATER-THAN SIGN character token. Switch to the data state. */ 332 $this->emitToken(array( 333 'type' => self::CHARACTR, 334 'data' => '<>' 335 )); 336 337 $this->state = 'data'; 338 339 } elseif($char === '?') { 340 /* U+003F QUESTION MARK (?) 341 Parse error. Switch to the bogus comment state. */ 342 $this->state = 'bogusComment'; 343 344 } else { 345 /* Anything else 346 Parse error. Emit a U+003C LESS-THAN SIGN character token and 347 reconsume the current input character in the data state. 
*/ 348 $this->emitToken(array( 349 'type' => self::CHARACTR, 350 'data' => '<' 351 )); 352 353 $this->char--; 354 $this->state = 'data'; 355 } 356 break; 357 } 358 } 359 360 private function closeTagOpenState() { 361 $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); 362 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; 363 364 if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && 365 (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/', 366 $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) { 367 /* If the content model flag is set to the RCDATA or CDATA states then 368 examine the next few characters. If they do not match the tag name of 369 the last start tag token emitted (case insensitively), or if they do but 370 they are not immediately followed by one of the following characters: 371 * U+0009 CHARACTER TABULATION 372 * U+000A LINE FEED (LF) 373 * U+000B LINE TABULATION 374 * U+000C FORM FEED (FF) 375 * U+0020 SPACE 376 * U+003E GREATER-THAN SIGN (>) 377 * U+002F SOLIDUS (/) 378 * EOF 379 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character 380 token, a U+002F SOLIDUS character token, and switch to the data state 381 to process the next input character. */ 382 $this->emitToken(array( 383 'type' => self::CHARACTR, 384 'data' => '</' 385 )); 386 387 $this->state = 'data'; 388 389 } else { 390 /* Otherwise, if the content model flag is set to the PCDATA state, 391 or if the next few characters do match that tag name, consume the 392 next input character: */ 393 $this->char++; 394 $char = $this->char(); 395 396 if(preg_match('/^[A-Za-z]$/', $char)) { 397 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 398 Create a new end tag token, set its tag name to the lowercase version 399 of the input character (add 0x0020 to the character's code point), then 400 switch to the tag name state. 
(Don't emit the token yet; further details 401 will be filled in before it is emitted.) */ 402 $this->token = array( 403 'name' => strtolower($char), 404 'type' => self::ENDTAG 405 ); 406 407 $this->state = 'tagName'; 408 409 } elseif($char === '>') { 410 /* U+003E GREATER-THAN SIGN (>) 411 Parse error. Switch to the data state. */ 412 $this->state = 'data'; 413 414 } elseif($this->char === $this->EOF) { 415 /* EOF 416 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F 417 SOLIDUS character token. Reconsume the EOF character in the data state. */ 418 $this->emitToken(array( 419 'type' => self::CHARACTR, 420 'data' => '</' 421 )); 422 423 $this->char--; 424 $this->state = 'data'; 425 426 } else { 427 /* Parse error. Switch to the bogus comment state. */ 428 $this->state = 'bogusComment'; 429 } 430 } 431 } 432 433 private function tagNameState() { 434 // Consume the next input character: 435 $this->char++; 436 $char = $this->character($this->char); 437 438 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 439 /* U+0009 CHARACTER TABULATION 440 U+000A LINE FEED (LF) 441 U+000B LINE TABULATION 442 U+000C FORM FEED (FF) 443 U+0020 SPACE 444 Switch to the before attribute name state. */ 445 $this->state = 'beforeAttributeName'; 446 447 } elseif($char === '>') { 448 /* U+003E GREATER-THAN SIGN (>) 449 Emit the current tag token. Switch to the data state. */ 450 $this->emitToken($this->token); 451 $this->state = 'data'; 452 453 } elseif($this->char === $this->EOF) { 454 /* EOF 455 Parse error. Emit the current tag token. Reconsume the EOF 456 character in the data state. */ 457 $this->emitToken($this->token); 458 459 $this->char--; 460 $this->state = 'data'; 461 462 } elseif($char === '/') { 463 /* U+002F SOLIDUS (/) 464 Parse error unless this is a permitted slash. Switch to the before 465 attribute name state. 
*/ 466 $this->state = 'beforeAttributeName'; 467 468 } else { 469 /* Anything else 470 Append the current input character to the current tag token's tag name. 471 Stay in the tag name state. */ 472 $this->token['name'] .= strtolower($char); 473 $this->state = 'tagName'; 474 } 475 } 476 477 private function beforeAttributeNameState() { 478 // Consume the next input character: 479 $this->char++; 480 $char = $this->character($this->char); 481 482 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 483 /* U+0009 CHARACTER TABULATION 484 U+000A LINE FEED (LF) 485 U+000B LINE TABULATION 486 U+000C FORM FEED (FF) 487 U+0020 SPACE 488 Stay in the before attribute name state. */ 489 $this->state = 'beforeAttributeName'; 490 491 } elseif($char === '>') { 492 /* U+003E GREATER-THAN SIGN (>) 493 Emit the current tag token. Switch to the data state. */ 494 $this->emitToken($this->token); 495 $this->state = 'data'; 496 497 } elseif($char === '/') { 498 /* U+002F SOLIDUS (/) 499 Parse error unless this is a permitted slash. Stay in the before 500 attribute name state. */ 501 $this->state = 'beforeAttributeName'; 502 503 } elseif($this->char === $this->EOF) { 504 /* EOF 505 Parse error. Emit the current tag token. Reconsume the EOF 506 character in the data state. */ 507 $this->emitToken($this->token); 508 509 $this->char--; 510 $this->state = 'data'; 511 512 } else { 513 /* Anything else 514 Start a new attribute in the current tag token. Set that attribute's 515 name to the current input character, and its value to the empty string. 516 Switch to the attribute name state. 
*/ 517 $this->token['attr'][] = array( 518 'name' => strtolower($char), 519 'value' => null 520 ); 521 522 $this->state = 'attributeName'; 523 } 524 } 525 526 private function attributeNameState() { 527 // Consume the next input character: 528 $this->char++; 529 $char = $this->character($this->char); 530 531 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 532 /* U+0009 CHARACTER TABULATION 533 U+000A LINE FEED (LF) 534 U+000B LINE TABULATION 535 U+000C FORM FEED (FF) 536 U+0020 SPACE 537 Stay in the before attribute name state. */ 538 $this->state = 'afterAttributeName'; 539 540 } elseif($char === '=') { 541 /* U+003D EQUALS SIGN (=) 542 Switch to the before attribute value state. */ 543 $this->state = 'beforeAttributeValue'; 544 545 } elseif($char === '>') { 546 /* U+003E GREATER-THAN SIGN (>) 547 Emit the current tag token. Switch to the data state. */ 548 $this->emitToken($this->token); 549 $this->state = 'data'; 550 551 } elseif($char === '/' && $this->character($this->char + 1) !== '>') { 552 /* U+002F SOLIDUS (/) 553 Parse error unless this is a permitted slash. Switch to the before 554 attribute name state. */ 555 $this->state = 'beforeAttributeName'; 556 557 } elseif($this->char === $this->EOF) { 558 /* EOF 559 Parse error. Emit the current tag token. Reconsume the EOF 560 character in the data state. */ 561 $this->emitToken($this->token); 562 563 $this->char--; 564 $this->state = 'data'; 565 566 } else { 567 /* Anything else 568 Append the current input character to the current attribute's name. 569 Stay in the attribute name state. 
*/ 570 $last = count($this->token['attr']) - 1; 571 $this->token['attr'][$last]['name'] .= strtolower($char); 572 573 $this->state = 'attributeName'; 574 } 575 } 576 577 private function afterAttributeNameState() { 578 // Consume the next input character: 579 $this->char++; 580 $char = $this->character($this->char); 581 582 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 583 /* U+0009 CHARACTER TABULATION 584 U+000A LINE FEED (LF) 585 U+000B LINE TABULATION 586 U+000C FORM FEED (FF) 587 U+0020 SPACE 588 Stay in the after attribute name state. */ 589 $this->state = 'afterAttributeName'; 590 591 } elseif($char === '=') { 592 /* U+003D EQUALS SIGN (=) 593 Switch to the before attribute value state. */ 594 $this->state = 'beforeAttributeValue'; 595 596 } elseif($char === '>') { 597 /* U+003E GREATER-THAN SIGN (>) 598 Emit the current tag token. Switch to the data state. */ 599 $this->emitToken($this->token); 600 $this->state = 'data'; 601 602 } elseif($char === '/' && $this->character($this->char + 1) !== '>') { 603 /* U+002F SOLIDUS (/) 604 Parse error unless this is a permitted slash. Switch to the 605 before attribute name state. */ 606 $this->state = 'beforeAttributeName'; 607 608 } elseif($this->char === $this->EOF) { 609 /* EOF 610 Parse error. Emit the current tag token. Reconsume the EOF 611 character in the data state. */ 612 $this->emitToken($this->token); 613 614 $this->char--; 615 $this->state = 'data'; 616 617 } else { 618 /* Anything else 619 Start a new attribute in the current tag token. Set that attribute's 620 name to the current input character, and its value to the empty string. 621 Switch to the attribute name state. 
*/ 622 $this->token['attr'][] = array( 623 'name' => strtolower($char), 624 'value' => null 625 ); 626 627 $this->state = 'attributeName'; 628 } 629 } 630 631 private function beforeAttributeValueState() { 632 // Consume the next input character: 633 $this->char++; 634 $char = $this->character($this->char); 635 636 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 637 /* U+0009 CHARACTER TABULATION 638 U+000A LINE FEED (LF) 639 U+000B LINE TABULATION 640 U+000C FORM FEED (FF) 641 U+0020 SPACE 642 Stay in the before attribute value state. */ 643 $this->state = 'beforeAttributeValue'; 644 645 } elseif($char === '"') { 646 /* U+0022 QUOTATION MARK (") 647 Switch to the attribute value (double-quoted) state. */ 648 $this->state = 'attributeValueDoubleQuoted'; 649 650 } elseif($char === '&') { 651 /* U+0026 AMPERSAND (&) 652 Switch to the attribute value (unquoted) state and reconsume 653 this input character. */ 654 $this->char--; 655 $this->state = 'attributeValueUnquoted'; 656 657 } elseif($char === '\'') { 658 /* U+0027 APOSTROPHE (') 659 Switch to the attribute value (single-quoted) state. */ 660 $this->state = 'attributeValueSingleQuoted'; 661 662 } elseif($char === '>') { 663 /* U+003E GREATER-THAN SIGN (>) 664 Emit the current tag token. Switch to the data state. */ 665 $this->emitToken($this->token); 666 $this->state = 'data'; 667 668 } else { 669 /* Anything else 670 Append the current input character to the current attribute's value. 671 Switch to the attribute value (unquoted) state. */ 672 $last = count($this->token['attr']) - 1; 673 $this->token['attr'][$last]['value'] .= $char; 674 675 $this->state = 'attributeValueUnquoted'; 676 } 677 } 678 679 private function attributeValueDoubleQuotedState() { 680 // Consume the next input character: 681 $this->char++; 682 $char = $this->character($this->char); 683 684 if($char === '"') { 685 /* U+0022 QUOTATION MARK (") 686 Switch to the before attribute name state. 
*/ 687 $this->state = 'beforeAttributeName'; 688 689 } elseif($char === '&') { 690 /* U+0026 AMPERSAND (&) 691 Switch to the entity in attribute value state. */ 692 $this->entityInAttributeValueState('double'); 693 694 } elseif($this->char === $this->EOF) { 695 /* EOF 696 Parse error. Emit the current tag token. Reconsume the character 697 in the data state. */ 698 $this->emitToken($this->token); 699 700 $this->char--; 701 $this->state = 'data'; 702 703 } else { 704 /* Anything else 705 Append the current input character to the current attribute's value. 706 Stay in the attribute value (double-quoted) state. */ 707 $last = count($this->token['attr']) - 1; 708 $this->token['attr'][$last]['value'] .= $char; 709 710 $this->state = 'attributeValueDoubleQuoted'; 711 } 712 } 713 714 private function attributeValueSingleQuotedState() { 715 // Consume the next input character: 716 $this->char++; 717 $char = $this->character($this->char); 718 719 if($char === '\'') { 720 /* U+0022 QUOTATION MARK (') 721 Switch to the before attribute name state. */ 722 $this->state = 'beforeAttributeName'; 723 724 } elseif($char === '&') { 725 /* U+0026 AMPERSAND (&) 726 Switch to the entity in attribute value state. */ 727 $this->entityInAttributeValueState('single'); 728 729 } elseif($this->char === $this->EOF) { 730 /* EOF 731 Parse error. Emit the current tag token. Reconsume the character 732 in the data state. */ 733 $this->emitToken($this->token); 734 735 $this->char--; 736 $this->state = 'data'; 737 738 } else { 739 /* Anything else 740 Append the current input character to the current attribute's value. 741 Stay in the attribute value (single-quoted) state. 
*/ 742 $last = count($this->token['attr']) - 1; 743 $this->token['attr'][$last]['value'] .= $char; 744 745 $this->state = 'attributeValueSingleQuoted'; 746 } 747 } 748 749 private function attributeValueUnquotedState() { 750 // Consume the next input character: 751 $this->char++; 752 $char = $this->character($this->char); 753 754 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 755 /* U+0009 CHARACTER TABULATION 756 U+000A LINE FEED (LF) 757 U+000B LINE TABULATION 758 U+000C FORM FEED (FF) 759 U+0020 SPACE 760 Switch to the before attribute name state. */ 761 $this->state = 'beforeAttributeName'; 762 763 } elseif($char === '&') { 764 /* U+0026 AMPERSAND (&) 765 Switch to the entity in attribute value state. */ 766 $this->entityInAttributeValueState(); 767 768 } elseif($char === '>') { 769 /* U+003E GREATER-THAN SIGN (>) 770 Emit the current tag token. Switch to the data state. */ 771 $this->emitToken($this->token); 772 $this->state = 'data'; 773 774 } else { 775 /* Anything else 776 Append the current input character to the current attribute's value. 777 Stay in the attribute value (unquoted) state. */ 778 $last = count($this->token['attr']) - 1; 779 $this->token['attr'][$last]['value'] .= $char; 780 781 $this->state = 'attributeValueUnquoted'; 782 } 783 } 784 785 private function entityInAttributeValueState() { 786 // Attempt to consume an entity. 787 $entity = $this->entity(); 788 789 // If nothing is returned, append a U+0026 AMPERSAND character to the 790 // current attribute's value. Otherwise, emit the character token that 791 // was returned. 792 $char = (!$entity) 793 ? '&' 794 : $entity; 795 796 $last = count($this->token['attr']) - 1; 797 $this->token['attr'][$last]['value'] .= $char; 798 } 799 800 private function bogusCommentState() { 801 /* Consume every character up to the first U+003E GREATER-THAN SIGN 802 character (>) or the end of the file (EOF), whichever comes first. 
Emit 803 a comment token whose data is the concatenation of all the characters 804 starting from and including the character that caused the state machine 805 to switch into the bogus comment state, up to and including the last 806 consumed character before the U+003E character, if any, or up to the 807 end of the file otherwise. (If the comment was started by the end of 808 the file (EOF), the token is empty.) */ 809 $data = $this->characters('^>', $this->char); 810 $this->emitToken(array( 811 'data' => $data, 812 'type' => self::COMMENT 813 )); 814 815 $this->char += strlen($data); 816 817 /* Switch to the data state. */ 818 $this->state = 'data'; 819 820 /* If the end of the file was reached, reconsume the EOF character. */ 821 if($this->char === $this->EOF) { 822 $this->char = $this->EOF - 1; 823 } 824 } 825 826 private function markupDeclarationOpenState() { 827 /* If the next two characters are both U+002D HYPHEN-MINUS (-) 828 characters, consume those two characters, create a comment token whose 829 data is the empty string, and switch to the comment state. */ 830 if($this->character($this->char + 1, 2) === '--') { 831 $this->char += 2; 832 $this->state = 'comment'; 833 $this->token = array( 834 'data' => null, 835 'type' => self::COMMENT 836 ); 837 838 /* Otherwise if the next seven chacacters are a case-insensitive match 839 for the word "DOCTYPE", then consume those characters and switch to the 840 DOCTYPE state. */ 841 } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') { 842 $this->char += 7; 843 $this->state = 'doctype'; 844 845 /* Otherwise, is is a parse error. Switch to the bogus comment state. 846 The next character that is consumed, if any, is the first character 847 that will be in the comment. 
*/ 848 } else { 849 $this->char++; 850 $this->state = 'bogusComment'; 851 } 852 } 853 854 private function commentState() { 855 /* Consume the next input character: */ 856 $this->char++; 857 $char = $this->char(); 858 859 /* U+002D HYPHEN-MINUS (-) */ 860 if($char === '-') { 861 /* Switch to the comment dash state */ 862 $this->state = 'commentDash'; 863 864 /* EOF */ 865 } elseif($this->char === $this->EOF) { 866 /* Parse error. Emit the comment token. Reconsume the EOF character 867 in the data state. */ 868 $this->emitToken($this->token); 869 $this->char--; 870 $this->state = 'data'; 871 872 /* Anything else */ 873 } else { 874 /* Append the input character to the comment token's data. Stay in 875 the comment state. */ 876 $this->token['data'] .= $char; 877 } 878 } 879 880 private function commentDashState() { 881 /* Consume the next input character: */ 882 $this->char++; 883 $char = $this->char(); 884 885 /* U+002D HYPHEN-MINUS (-) */ 886 if($char === '-') { 887 /* Switch to the comment end state */ 888 $this->state = 'commentEnd'; 889 890 /* EOF */ 891 } elseif($this->char === $this->EOF) { 892 /* Parse error. Emit the comment token. Reconsume the EOF character 893 in the data state. */ 894 $this->emitToken($this->token); 895 $this->char--; 896 $this->state = 'data'; 897 898 /* Anything else */ 899 } else { 900 /* Append a U+002D HYPHEN-MINUS (-) character and the input 901 character to the comment token's data. Switch to the comment state. 
*/
            $this->token['data'] .= '-'.$char;
            $this->state = 'comment';
        }
    }

    /**
     * Comment end state.
     *
     * Consumes one character and then:
     *  - '>'          emits the current comment token, switches to 'data';
     *  - '-'          appends a single dash to the comment data, stays here;
     *  - end of input emits the token, steps back one position, switches
     *                 to 'data';
     *  - anything else appends '--' plus the character and returns to the
     *                 'comment' state.
     */
    private function commentEndState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '-') {
            $this->token['data'] .= '-';

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            // Step back so the position is before EOF again.
            $this->char--;
            $this->state = 'data';

        } else {
            $this->token['data'] .= '--'.$char;
            $this->state = 'comment';
        }
    }

    /**
     * DOCTYPE state.
     *
     * Consumes one character. A whitespace character is swallowed; any
     * other character is pushed back (position decremented). In both cases
     * the next state is 'beforeDoctypeName'.
     */
    private function doctypeState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if(!preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            // Not whitespace: leave it for the next state to read.
            $this->char--;
        }

        $this->state = 'beforeDoctypeName';
    }

    /**
     * Before DOCTYPE name state.
     *
     * Skips whitespace. The first other character either starts a DOCTYPE
     * token (lowercase ASCII letters are uppercased) and switches to
     * 'doctypeName', or - for '>' and end of input - emits a nameless
     * DOCTYPE token flagged as an error and switches to 'data'.
     */
    private function beforeDoctypeNameState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            // Stay in the before DOCTYPE name state.

        } elseif(preg_match('/^[a-z]$/', $char)) {
            $this->token = array(
                'name' => strtoupper($char),
                'type' => self::DOCTYPE,
                'error' => true
            );

            $this->state = 'doctypeName';

        } elseif($char === '>') {
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            $this->char--;
            $this->state = 'data';

        } else {
            $this->token = array(
                'name' => $char,
                'type' => self::DOCTYPE,
                'error' => true
            );

            $this->state = 'doctypeName';
        }
    }

    /**
     * DOCTYPE name state.
     *
     * Accumulates the DOCTYPE name (lowercase ASCII letters are uppercased)
     * until whitespace, '>' or end of input. After every character the
     * token's error flag is recomputed: it is false only while the name is
     * exactly 'HTML'.
     */
    private function doctypeNameState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            // NOTE(review): every other state name in this file starts with
            // a lowercase letter; presumably this only works because the
            // state dispatcher resolves method names case-insensitively -
            // confirm before normalising the string.
            $this->state = 'AfterDoctypeName';

        } elseif($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif(preg_match('/^[a-z]$/', $char)) {
            $this->token['name'] .= strtoupper($char);

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        } else {
            $this->token['name'] .= $char;
        }

        // Runs on every path, including the ones that already emitted.
        $this->token['error'] = ($this->token['name'] !== 'HTML');
    }

    /**
     * After DOCTYPE name state.
     *
     * Skips whitespace; '>' or end of input emits the DOCTYPE token and
     * switches to 'data'; any other character flags the token as an error
     * and switches to 'bogusDoctype'.
     */
    private function afterDoctypeNameState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            // Stay in the after DOCTYPE name state.

        } elseif($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        } else {
            $this->token['error'] = true;
            $this->state = 'bogusDoctype';
        }
    }

    /**
     * Bogus DOCTYPE state.
     *
     * Discards characters until '>' or end of input, then emits the
     * DOCTYPE token and switches to 'data'.
     */
    private function bogusDoctypeState() {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';

        } else {
            // Stay in the bogus DOCTYPE state.
        }
    }

    /**
     * Consumes an entity. Used when parsing entities in text and in
     * attributes. The lookahead below treats the current position as the
     * U+0026 AMPERSAND and inspects the characters that follow it.
     *
     * @return string|false The decoded replacement text, or false when no
     *                      entity matched (the position is then restored
     *                      to where it was on entry).
     */
    private function entity() {
        $start = $this->char;

        // The behaviour depends on the identity of the next character (the
        // one immediately after the U+0026 AMPERSAND character):

        switch($this->character($this->char + 1)) {
            // U+0023 NUMBER SIGN (#)
            case '#':

                // The behaviour further depends on the character after the
                // U+0023 NUMBER SIGN. That character sits two positions past
                // the ampersand; looking at +1 would re-read the '#' and
                // make the hexadecimal branch unreachable.
                switch($this->character($this->char + 2)) {
                    // U+0078 LATIN SMALL LETTER X
                    // U+0058 LATIN CAPITAL LETTER X
                    case 'x':
                    case 'X':
                        // Hexadecimal reference: digits 0-9, A-F, a-f.
                        // $char = 1 skips the 'x' when reading the digits.
                        $char = 1;
                        $char_class = '0-9A-Fa-f';
                    break;

                    // Anything else
                    default:
                        // Decimal reference: digits 0-9 only.
                        $char = 0;
                        $char_class = '0-9';
                    break;
                }

                // Consume as many characters as match the range of characters
                // given above.
                $this->char++;
                $e_name = $this->characters($char_class, $this->char + $char + 1);
                // NOTE(review): the second argument here is an absolute
                // position; character() is defined outside this excerpt, so
                // confirm that this slice really yields the entity text
                // expected by html_entity_decode() below.
                $entity = $this->character($start, $this->char);
                $cond = strlen($e_name) > 0;

                // The rest of the parsing happens below.
            break;

            // Anything else
            default:
                // Consume the maximum number of characters possible, with the
                // consumed characters case-sensitively matching one of the
                // identifiers in the first column of the entities table.
                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                $len = strlen($e_name);

                // Try successively longer prefixes; the first one found in
                // the entities table wins.
                for($c = 1; $c <= $len; $c++) {
                    $id = substr($e_name, 0, $c);
                    $this->char++;

                    if(in_array($id, $this->entities)) {
                        if ($e_name[$c-1] !== ';') {
                            if ($c < $len && $e_name[$c] == ';') {
                                $this->char++; // consume extra semicolon
                            }
                        }
                        $entity = $id;
                        break;
                    }
                }

                $cond = isset($entity);
                // The rest of the parsing happens below.
            break;
        }

        if(!$cond) {
            // If no match can be made, then this is a parse error. No
            // characters are consumed, and nothing is returned.
            $this->char = $start;
            return false;
        }

        // Return a character token for the character corresponding to the
        // entity name (as given by the second column of the entities table).
        return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
    }

    /**
     * Hands a token to the tree constructor and updates the tokenizer's
     * content model from the result: an integer return value becomes the
     * new content model; otherwise an end tag resets it to PCDATA.
     *
     * @param array $token Token array; its 'type' key is read here.
     */
    private function emitToken($token) {
        $emit = $this->tree->emitToken($token);

        if(is_int($emit)) {
            $this->content_model = $emit;

        } elseif($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }

    /**
     * Clears the tokenizer state and emits an end-of-file token to the
     * tree constructor.
     */
    private function EOF() {
        $this->state = null;
        $this->tree->emitToken(array(
            'type' => self::EOF
        ));
    }
}

class HTML5TreeConstructer {
    // Stack of open elements; the last entry is the current node.
    public $stack = array();

    private $phase;
    private $mode;
    private $dom;
    private $foster_parent = null;
    // List of active formatting elements (elements and MARKER entries).
    private $a_formatting = array();

    private $head_pointer = null;
    private $form_pointer = null;

    private $scoping = array('button','caption','html','marquee','object','table','td','th');
    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
    private $special = array('address','area','base','basefont','bgsound',
    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
    'h6','head','hr','iframe','image','img','input','isindex','li','link',
    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
    'option','p','param','plaintext','pre','script','select','spacer','style',
    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');

    // The different phases.
    const INIT_PHASE = 0;
    const ROOT_PHASE = 1;
    const MAIN_PHASE = 2;
    const END_PHASE  = 3;

    // The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
    const IN_HEAD    = 1;
    const AFTER_HEAD = 2;
    const IN_BODY    = 3;
    const IN_TABLE   = 4;
    const IN_CAPTION = 5;
    const IN_CGROUP  = 6;
    const IN_TBODY   = 7;
    const IN_ROW     = 8;
    const IN_CELL    = 9;
    const IN_SELECT  = 10;
    const AFTER_BODY = 11;
    const IN_FRAME   = 12;
    const AFTR_FRAME = 13;

    // The different types of elements.
    const SPECIAL    = 0;
    const SCOPING    = 1;
    const FORMATTING = 2;
    const PHRASING   = 3;

    // Entry placed in the list of active formatting elements as a marker.
    const MARKER     = 0;

    /**
     * Starts in the initial phase / "before head" insertion mode with an
     * empty UTF-8 DOMDocument that preserves whitespace, substitutes
     * entities and has strict error checking disabled.
     */
    public function __construct() {
        $this->phase = self::INIT_PHASE;
        $this->mode = self::BEFOR_HEAD;
        $this->dom = new DOMDocument;

        $this->dom->encoding = 'UTF-8';
        $this->dom->preserveWhiteSpace = true;
        $this->dom->substituteEntities = true;
        $this->dom->strictErrorChecking = false;
    }

    /**
     * Routes a token to the handler of the current phase.
     *
     * @param array $token Token array with at least a 'type' key.
     * @return mixed Whatever the phase handler returns. Some insertion-mode
     *               handlers return an HTML5 content-model constant, others
     *               return nothing.
     */
    public function emitToken($token) {
        switch($this->phase) {
            case self::INIT_PHASE: return $this->initPhase($token);
            case self::ROOT_PHASE: return $this->rootElementPhase($token);
            case self::MAIN_PHASE: return $this->mainPhase($token);
            case self::END_PHASE : return $this->trailingEndPhase($token);
        }
    }

    /**
     * Initial phase: handles tokens seen before the root element phase.
     *
     * @param array $token Token array ('type', optionally 'error', 'data').
     * @return mixed Result of rootElementPhase() when the token is
     *               forwarded there, otherwise nothing.
     */
    private function initPhase($token) {
        /* A DOCTYPE token that is marked as being in error
        A comment token
        A start tag token
        An end tag token
        A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE
        An end-of-file token */
        if((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
            /* This specification does not define how to handle this case.
            Here the token is simply handed to the root element phase. */
            $this->phase = self::ROOT_PHASE;
            return $this->rootElementPhase($token);

        /* A DOCTYPE token marked as being correct */
        } elseif(isset($token['error']) && !$token['error']) {
            /* Switch to the root element phase of the tree construction
            stage. No DocumentType node is attached to the document: the
            previous code constructed one but never appended or used it, so
            the dead allocation has been dropped. */
            $this->phase = self::ROOT_PHASE;

        /* A character token that is one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE */
        } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
        }
    }

    /**
     * Root element phase: appends leading comments and whitespace to the
     * document and creates the <html> root element as soon as any other
     * content arrives.
     *
     * @param array $token Token array ('type', optionally 'data').
     * @return mixed Result of mainPhase() when the token is reprocessed
     *               there, otherwise nothing.
     */
    private function rootElementPhase($token) {
        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE */
        } elseif($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
            (FF), or U+0020 SPACE
        A start tag token
        An end tag token
        An end-of-file token */
        } elseif(($token['type'] === HTML5::CHARACTR &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF) {
            /* Create an element with the tag name html, append it to the
            Document object and push it as the first open element. Switch
            to the main phase and reprocess the current token. */
            $html = $this->dom->createElement('html');
            $this->dom->appendChild($html);
            $this->stack[] = $html;

            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);
        }
    }

    /**
     * Main phase: handles the tokens common to every insertion mode and
     * dispatches everything else to the handler of the current mode.
     *
     * @param array $token Token array ('type', and for tags 'name'/'attr').
     * @return mixed Result of the insertion-mode handler, otherwise nothing.
     */
    private function mainPhase($token) {
        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A start tag token with the tag name "html" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
            /* If this start tag token was not the first start tag token, then
            it is a parse error. */

            /* For each attribute on the token, check to see if the attribute
            is already present on the top element of the stack of open elements.
            If it is not, add the attribute and its corresponding value to that
            element. */
            foreach($token['attr'] as $attr) {
                if(!$this->stack[0]->hasAttribute($attr['name'])) {
                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
                }
            }

        /* An end-of-file token */
        } elseif($token['type'] === HTML5::EOF) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

        /* Anything else. */
        } else {
            /* Depends on the insertion mode. The former trailing
            "case self::END_PHASE" was removed: END_PHASE is a phase, not an
            insertion mode, and its value (3) equals IN_BODY, so that case
            could never be selected. */
            switch($this->mode) {
                case self::BEFOR_HEAD: return $this->beforeHead($token);
                case self::IN_HEAD:    return $this->inHead($token);
                case self::AFTER_HEAD: return $this->afterHead($token);
                case self::IN_BODY:    return $this->inBody($token);
                case self::IN_TABLE:   return $this->inTable($token);
                case self::IN_CAPTION: return $this->inCaption($token);
                case self::IN_CGROUP:  return $this->inColumnGroup($token);
                case self::IN_TBODY:   return $this->inTableBody($token);
                case self::IN_ROW:     return $this->inRow($token);
                case self::IN_CELL:    return $this->inCell($token);
                case self::IN_SELECT:  return $this->inSelect($token);
                case self::AFTER_BODY: return $this->afterBody($token);
                case self::IN_FRAME:   return $this->inFrameset($token);
                case self::AFTR_FRAME: return $this->afterFrameset($token);
            }
        }
    }

    private function
beforeHead($token) {
        /* "Before head" insertion mode. Handle the token as follows.
        Returns the result of inHead() when the token is reprocessed there,
        otherwise nothing. */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "head" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
            /* Create an element for the token, append the new element to the
            current node and push it onto the stack of open elements. */
            $element = $this->insertElement($token);

            /* Set the head element pointer to this new element node. */
            $this->head_pointer = $element;

            /* Change the insertion mode to "in head". */
            $this->mode = self::IN_HEAD;

        /* Any other start tag token. Or an end tag with the tag name "html".
        Or a character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. (The whitespace pattern now carries the same "+"
        quantifier as every other whitespace test; pure-whitespace tokens
        never reach this branch, so the outcome is unchanged.) */
        } elseif($token['type'] === HTML5::STARTTAG ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data']))) {
            /* Act as if a start tag token with the tag name "head" and no
            attributes had been seen, then reprocess the current token. */
            $this->beforeHead(array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inHead($token);

        /* Any other end tag */
        } elseif($token['type'] === HTML5::ENDTAG) {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * "In head" insertion mode.
     *
     * @param array $token Token array ('type', and depending on the type
     *                     'name', 'attr' or 'data').
     * @return mixed HTML5::PCDATA, HTML5::RCDATA or HTML5::CDATA when the
     *               tokenizer's content model must change, the result of
     *               afterHead() when the token is reprocessed there,
     *               otherwise nothing.
     */
    private function inHead($token) {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE.

        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
        or script element, append the character to the current node regardless
        of its content. */
        if(($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
        $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
        array('title', 'style', 'script')))) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag with the tag name "title", "style" or "script" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'))) {
            /* Pop the current node and switch the tokeniser back to PCDATA. */
            array_pop($this->stack);
            return HTML5::PCDATA;

        /* A start tag with the tag name "title" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the RCDATA state. */
            return HTML5::RCDATA;

        /* A start tag with the tag name "style" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "script" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
            /* Create an element for the token. The head element pointer is
            checked first, exactly as the "title" and "style" branches do;
            previously a null pointer was dereferenced here. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "base", "link", or "meta" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'))) {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
                array_pop($this->stack);

            } else {
                $this->insertElement($token);
            }

        /* An end tag with the tag name "head" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
            /* If the current node is a head element, pop the current node off
            the stack of open elements. A null head element pointer is
            treated as "not the current node". */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                array_pop($this->stack);

            /* Otherwise, this is a parse error. */
            } else {
                // Nothing to pop.
            }

            /* Change the insertion mode to "after head". */
            $this->mode = self::AFTER_HEAD;

        /* A start tag with the tag name "head" or an end tag except "html". */
        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* If the current node is a head element, act as if an end tag
            token with the tag name "head" had been seen. */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                $this->inHead(array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                ));

            /* Otherwise, change the insertion mode to "after head". */
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            /* Then, reprocess the current token. */
            return $this->afterHead($token);
        }
    }

    /**
     * "After head" insertion mode.
     *
     * @param array $token Token array ('type', and depending on the type
     *                     'name', 'attr' or 'data').
     * @return mixed Result of inHead() or inBody() when the token is
     *               reprocessed there, otherwise nothing.
     */
    private function afterHead($token) {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "body" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
            /* Insert a body element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in body". */
            $this->mode = self::IN_BODY;

        /* A start tag token with the tag name "frameset" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
            /* Insert a frameset element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in frameset". */
            $this->mode = self::IN_FRAME;

        /* A start tag token whose tag name is one of: "base", "link", "meta",
        "script", "style", "title" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);

        /* Anything else */
        } else {
            /* Act as if a start tag token with the tag name "body" and no
            attributes had been seen, and then reprocess the current token. */
            $this->afterHead(array(
                'name' => 'body',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inBody($token);
        }
    }

    private function inBody($token) {
        /* Handle the token as follows: */

        switch($token['type']) {
            /* A character token */
            case HTML5::CHARACTR:
                /* Reconstruct the active formatting elements, if any.
*/ 1632 $this->reconstructActiveFormattingElements(); 1633 1634 /* Append the token's character to the current node. */ 1635 $this->insertText($token['data']); 1636 break; 1637 1638 /* A comment token */ 1639 case HTML5::COMMENT: 1640 /* Append a Comment node to the current node with the data 1641 attribute set to the data given in the comment token. */ 1642 $this->insertComment($token['data']); 1643 break; 1644 1645 case HTML5::STARTTAG: 1646 switch($token['name']) { 1647 /* A start tag token whose tag name is one of: "script", 1648 "style" */ 1649 case 'script': case 'style': 1650 /* Process the token as if the insertion mode had been "in 1651 head". */ 1652 return $this->inHead($token); 1653 break; 1654 1655 /* A start tag token whose tag name is one of: "base", "link", 1656 "meta", "title" */ 1657 case 'base': case 'link': case 'meta': case 'title': 1658 /* Parse error. Process the token as if the insertion mode 1659 had been "in head". */ 1660 return $this->inHead($token); 1661 break; 1662 1663 /* A start tag token with the tag name "body" */ 1664 case 'body': 1665 /* Parse error. If the second element on the stack of open 1666 elements is not a body element, or, if the stack of open 1667 elements has only one node on it, then ignore the token. 1668 (innerHTML case) */ 1669 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { 1670 // Ignore 1671 1672 /* Otherwise, for each attribute on the token, check to see 1673 if the attribute is already present on the body element (the 1674 second element) on the stack of open elements. If it is not, 1675 add the attribute and its corresponding value to that 1676 element. 
*/ 1677 } else { 1678 foreach($token['attr'] as $attr) { 1679 if(!$this->stack[1]->hasAttribute($attr['name'])) { 1680 $this->stack[1]->setAttribute($attr['name'], $attr['value']); 1681 } 1682 } 1683 } 1684 break; 1685 1686 /* A start tag whose tag name is one of: "address", 1687 "blockquote", "center", "dir", "div", "dl", "fieldset", 1688 "listing", "menu", "ol", "p", "ul" */ 1689 case 'address': case 'blockquote': case 'center': case 'dir': 1690 case 'div': case 'dl': case 'fieldset': case 'listing': 1691 case 'menu': case 'ol': case 'p': case 'ul': 1692 /* If the stack of open elements has a p element in scope, 1693 then act as if an end tag with the tag name p had been 1694 seen. */ 1695 if($this->elementInScope('p')) { 1696 $this->emitToken(array( 1697 'name' => 'p', 1698 'type' => HTML5::ENDTAG 1699 )); 1700 } 1701 1702 /* Insert an HTML element for the token. */ 1703 $this->insertElement($token); 1704 break; 1705 1706 /* A start tag whose tag name is "form" */ 1707 case 'form': 1708 /* If the form element pointer is not null, ignore the 1709 token with a parse error. */ 1710 if($this->form_pointer !== null) { 1711 // Ignore. 1712 1713 /* Otherwise: */ 1714 } else { 1715 /* If the stack of open elements has a p element in 1716 scope, then act as if an end tag with the tag name p 1717 had been seen. */ 1718 if($this->elementInScope('p')) { 1719 $this->emitToken(array( 1720 'name' => 'p', 1721 'type' => HTML5::ENDTAG 1722 )); 1723 } 1724 1725 /* Insert an HTML element for the token, and set the 1726 form element pointer to point to the element created. */ 1727 $element = $this->insertElement($token); 1728 $this->form_pointer = $element; 1729 } 1730 break; 1731 1732 /* A start tag whose tag name is "li", "dd" or "dt" */ 1733 case 'li': case 'dd': case 'dt': 1734 /* If the stack of open elements has a p element in scope, 1735 then act as if an end tag with the tag name p had been 1736 seen. 
*/ 1737 if($this->elementInScope('p')) { 1738 $this->emitToken(array( 1739 'name' => 'p', 1740 'type' => HTML5::ENDTAG 1741 )); 1742 } 1743 1744 $stack_length = count($this->stack) - 1; 1745 1746 for($n = $stack_length; 0 <= $n; $n--) { 1747 /* 1. Initialise node to be the current node (the 1748 bottommost node of the stack). */ 1749 $stop = false; 1750 $node = $this->stack[$n]; 1751 $cat = $this->getElementCategory($node->tagName); 1752 1753 /* 2. If node is an li, dd or dt element, then pop all 1754 the nodes from the current node up to node, including 1755 node, then stop this algorithm. */ 1756 if($token['name'] === $node->tagName || ($token['name'] !== 'li' 1757 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { 1758 for($x = $stack_length; $x >= $n ; $x--) { 1759 array_pop($this->stack); 1760 } 1761 1762 break; 1763 } 1764 1765 /* 3. If node is not in the formatting category, and is 1766 not in the phrasing category, and is not an address or 1767 div element, then stop this algorithm. */ 1768 if($cat !== self::FORMATTING && $cat !== self::PHRASING && 1769 $node->tagName !== 'address' && $node->tagName !== 'div') { 1770 break; 1771 } 1772 } 1773 1774 /* Finally, insert an HTML element with the same tag 1775 name as the token's. */ 1776 $this->insertElement($token); 1777 break; 1778 1779 /* A start tag token whose tag name is "plaintext" */ 1780 case 'plaintext': 1781 /* If the stack of open elements has a p element in scope, 1782 then act as if an end tag with the tag name p had been 1783 seen. */ 1784 if($this->elementInScope('p')) { 1785 $this->emitToken(array( 1786 'name' => 'p', 1787 'type' => HTML5::ENDTAG 1788 )); 1789 } 1790 1791 /* Insert an HTML element for the token. 
*/ 1792 $this->insertElement($token); 1793 1794 return HTML5::PLAINTEXT; 1795 break; 1796 1797 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", 1798 "h5", "h6" */ 1799 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 1800 /* If the stack of open elements has a p element in scope, 1801 then act as if an end tag with the tag name p had been seen. */ 1802 if($this->elementInScope('p')) { 1803 $this->emitToken(array( 1804 'name' => 'p', 1805 'type' => HTML5::ENDTAG 1806 )); 1807 } 1808 1809 /* If the stack of open elements has in scope an element whose 1810 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 1811 this is a parse error; pop elements from the stack until an 1812 element with one of those tag names has been popped from the 1813 stack. */ 1814 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { 1815 array_pop($this->stack); 1816 } 1817 1818 /* Insert an HTML element for the token. */ 1819 $this->insertElement($token); 1820 break; 1821 1822 /* A start tag whose tag name is "a" */ 1823 case 'a': 1824 /* If the list of active formatting elements contains 1825 an element whose tag name is "a" between the end of the 1826 list and the last marker on the list (or the start of 1827 the list if there is no marker on the list), then this 1828 is a parse error; act as if an end tag with the tag name 1829 "a" had been seen, then remove that element from the list 1830 of active formatting elements and the stack of open 1831 elements if the end tag didn't already remove it (it 1832 might not have if the element is not in table scope). 
*/ 1833 $leng = count($this->a_formatting); 1834 1835 for($n = $leng - 1; $n >= 0; $n--) { 1836 if($this->a_formatting[$n] === self::MARKER) { 1837 break; 1838 1839 } elseif($this->a_formatting[$n]->nodeName === 'a') { 1840 $this->emitToken(array( 1841 'name' => 'a', 1842 'type' => HTML5::ENDTAG 1843 )); 1844 break; 1845 } 1846 } 1847 1848 /* Reconstruct the active formatting elements, if any. */ 1849 $this->reconstructActiveFormattingElements(); 1850 1851 /* Insert an HTML element for the token. */ 1852 $el = $this->insertElement($token); 1853 1854 /* Add that element to the list of active formatting 1855 elements. */ 1856 $this->a_formatting[] = $el; 1857 break; 1858 1859 /* A start tag whose tag name is one of: "b", "big", "em", "font", 1860 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 1861 case 'b': case 'big': case 'em': case 'font': case 'i': 1862 case 'nobr': case 's': case 'small': case 'strike': 1863 case 'strong': case 'tt': case 'u': 1864 /* Reconstruct the active formatting elements, if any. */ 1865 $this->reconstructActiveFormattingElements(); 1866 1867 /* Insert an HTML element for the token. */ 1868 $el = $this->insertElement($token); 1869 1870 /* Add that element to the list of active formatting 1871 elements. */ 1872 $this->a_formatting[] = $el; 1873 break; 1874 1875 /* A start tag token whose tag name is "button" */ 1876 case 'button': 1877 /* If the stack of open elements has a button element in scope, 1878 then this is a parse error; act as if an end tag with the tag 1879 name "button" had been seen, then reprocess the token. (We don't 1880 do that. Unnecessary.) */ 1881 if($this->elementInScope('button')) { 1882 $this->inBody(array( 1883 'name' => 'button', 1884 'type' => HTML5::ENDTAG 1885 )); 1886 } 1887 1888 /* Reconstruct the active formatting elements, if any. */ 1889 $this->reconstructActiveFormattingElements(); 1890 1891 /* Insert an HTML element for the token. 
*/ 1892 $this->insertElement($token); 1893 1894 /* Insert a marker at the end of the list of active 1895 formatting elements. */ 1896 $this->a_formatting[] = self::MARKER; 1897 break; 1898 1899 /* A start tag token whose tag name is one of: "marquee", "object" */ 1900 case 'marquee': case 'object': 1901 /* Reconstruct the active formatting elements, if any. */ 1902 $this->reconstructActiveFormattingElements(); 1903 1904 /* Insert an HTML element for the token. */ 1905 $this->insertElement($token); 1906 1907 /* Insert a marker at the end of the list of active 1908 formatting elements. */ 1909 $this->a_formatting[] = self::MARKER; 1910 break; 1911 1912 /* A start tag token whose tag name is "xmp" */ 1913 case 'xmp': 1914 /* Reconstruct the active formatting elements, if any. */ 1915 $this->reconstructActiveFormattingElements(); 1916 1917 /* Insert an HTML element for the token. */ 1918 $this->insertElement($token); 1919 1920 /* Switch the content model flag to the CDATA state. */ 1921 return HTML5::CDATA; 1922 break; 1923 1924 /* A start tag whose tag name is "table" */ 1925 case 'table': 1926 /* If the stack of open elements has a p element in scope, 1927 then act as if an end tag with the tag name p had been seen. */ 1928 if($this->elementInScope('p')) { 1929 $this->emitToken(array( 1930 'name' => 'p', 1931 'type' => HTML5::ENDTAG 1932 )); 1933 } 1934 1935 /* Insert an HTML element for the token. */ 1936 $this->insertElement($token); 1937 1938 /* Change the insertion mode to "in table". */ 1939 $this->mode = self::IN_TABLE; 1940 break; 1941 1942 /* A start tag whose tag name is one of: "area", "basefont", 1943 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ 1944 case 'area': case 'basefont': case 'bgsound': case 'br': 1945 case 'embed': case 'img': case 'param': case 'spacer': 1946 case 'wbr': 1947 /* Reconstruct the active formatting elements, if any. 
*/ 1948 $this->reconstructActiveFormattingElements(); 1949 1950 /* Insert an HTML element for the token. */ 1951 $this->insertElement($token); 1952 1953 /* Immediately pop the current node off the stack of open elements. */ 1954 array_pop($this->stack); 1955 break; 1956 1957 /* A start tag whose tag name is "hr" */ 1958 case 'hr': 1959 /* If the stack of open elements has a p element in scope, 1960 then act as if an end tag with the tag name p had been seen. */ 1961 if($this->elementInScope('p')) { 1962 $this->emitToken(array( 1963 'name' => 'p', 1964 'type' => HTML5::ENDTAG 1965 )); 1966 } 1967 1968 /* Insert an HTML element for the token. */ 1969 $this->insertElement($token); 1970 1971 /* Immediately pop the current node off the stack of open elements. */ 1972 array_pop($this->stack); 1973 break; 1974 1975 /* A start tag whose tag name is "image" */ 1976 case 'image': 1977 /* Parse error. Change the token's tag name to "img" and 1978 reprocess it. (Don't ask.) */ 1979 $token['name'] = 'img'; 1980 return $this->inBody($token); 1981 break; 1982 1983 /* A start tag whose tag name is "input" */ 1984 case 'input': 1985 /* Reconstruct the active formatting elements, if any. */ 1986 $this->reconstructActiveFormattingElements(); 1987 1988 /* Insert an input element for the token. */ 1989 $element = $this->insertElement($token, false); 1990 1991 /* If the form element pointer is not null, then associate the 1992 input element with the form element pointed to by the form 1993 element pointer. */ 1994 $this->form_pointer !== null 1995 ? $this->form_pointer->appendChild($element) 1996 : end($this->stack)->appendChild($element); 1997 1998 /* Pop that input element off the stack of open elements. */ 1999 array_pop($this->stack); 2000 break; 2001 2002 /* A start tag whose tag name is "isindex" */ 2003 case 'isindex': 2004 /* Parse error. */ 2005 // w/e 2006 2007 /* If the form element pointer is not null, 2008 then ignore the token. 
*/ 2009 if($this->form_pointer === null) { 2010 /* Act as if a start tag token with the tag name "form" had 2011 been seen. */ 2012 $this->inBody(array( 2013 'name' => 'body', 2014 'type' => HTML5::STARTTAG, 2015 'attr' => array() 2016 )); 2017 2018 /* Act as if a start tag token with the tag name "hr" had 2019 been seen. */ 2020 $this->inBody(array( 2021 'name' => 'hr', 2022 'type' => HTML5::STARTTAG, 2023 'attr' => array() 2024 )); 2025 2026 /* Act as if a start tag token with the tag name "p" had 2027 been seen. */ 2028 $this->inBody(array( 2029 'name' => 'p', 2030 'type' => HTML5::STARTTAG, 2031 'attr' => array() 2032 )); 2033 2034 /* Act as if a start tag token with the tag name "label" 2035 had been seen. */ 2036 $this->inBody(array( 2037 'name' => 'label', 2038 'type' => HTML5::STARTTAG, 2039 'attr' => array() 2040 )); 2041 2042 /* Act as if a stream of character tokens had been seen. */ 2043 $this->insertText('This is a searchable index. '. 2044 'Insert your search keywords here: '); 2045 2046 /* Act as if a start tag token with the tag name "input" 2047 had been seen, with all the attributes from the "isindex" 2048 token, except with the "name" attribute set to the value 2049 "isindex" (ignoring any explicit "name" attribute). */ 2050 $attr = $token['attr']; 2051 $attr[] = array('name' => 'name', 'value' => 'isindex'); 2052 2053 $this->inBody(array( 2054 'name' => 'input', 2055 'type' => HTML5::STARTTAG, 2056 'attr' => $attr 2057 )); 2058 2059 /* Act as if a stream of character tokens had been seen 2060 (see below for what they should say). */ 2061 $this->insertText('This is a searchable index. '. 2062 'Insert your search keywords here: '); 2063 2064 /* Act as if an end tag token with the tag name "label" 2065 had been seen. */ 2066 $this->inBody(array( 2067 'name' => 'label', 2068 'type' => HTML5::ENDTAG 2069 )); 2070 2071 /* Act as if an end tag token with the tag name "p" had 2072 been seen. 
*/ 2073 $this->inBody(array( 2074 'name' => 'p', 2075 'type' => HTML5::ENDTAG 2076 )); 2077 2078 /* Act as if a start tag token with the tag name "hr" had 2079 been seen. */ 2080 $this->inBody(array( 2081 'name' => 'hr', 2082 'type' => HTML5::ENDTAG 2083 )); 2084 2085 /* Act as if an end tag token with the tag name "form" had 2086 been seen. */ 2087 $this->inBody(array( 2088 'name' => 'form', 2089 'type' => HTML5::ENDTAG 2090 )); 2091 } 2092 break; 2093 2094 /* A start tag whose tag name is "textarea" */ 2095 case 'textarea': 2096 $this->insertElement($token); 2097 2098 /* Switch the tokeniser's content model flag to the 2099 RCDATA state. */ 2100 return HTML5::RCDATA; 2101 break; 2102 2103 /* A start tag whose tag name is one of: "iframe", "noembed", 2104 "noframes" */ 2105 case 'iframe': case 'noembed': case 'noframes': 2106 $this->insertElement($token); 2107 2108 /* Switch the tokeniser's content model flag to the CDATA state. */ 2109 return HTML5::CDATA; 2110 break; 2111 2112 /* A start tag whose tag name is "select" */ 2113 case 'select': 2114 /* Reconstruct the active formatting elements, if any. */ 2115 $this->reconstructActiveFormattingElements(); 2116 2117 /* Insert an HTML element for the token. */ 2118 $this->insertElement($token); 2119 2120 /* Change the insertion mode to "in select". */ 2121 $this->mode = self::IN_SELECT; 2122 break; 2123 2124 /* A start or end tag whose tag name is one of: "caption", "col", 2125 "colgroup", "frame", "frameset", "head", "option", "optgroup", 2126 "tbody", "td", "tfoot", "th", "thead", "tr". */ 2127 case 'caption': case 'col': case 'colgroup': case 'frame': 2128 case 'frameset': case 'head': case 'option': case 'optgroup': 2129 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': 2130 case 'tr': 2131 // Parse error. Ignore the token. 
2132 break; 2133 2134 /* A start or end tag whose tag name is one of: "event-source", 2135 "section", "nav", "article", "aside", "header", "footer", 2136 "datagrid", "command" */ 2137 case 'event-source': case 'section': case 'nav': case 'article': 2138 case 'aside': case 'header': case 'footer': case 'datagrid': 2139 case 'command': 2140 // Work in progress! 2141 break; 2142 2143 /* A start tag token not covered by the previous entries */ 2144 default: 2145 /* Reconstruct the active formatting elements, if any. */ 2146 $this->reconstructActiveFormattingElements(); 2147 2148 $this->insertElement($token); 2149 break; 2150 } 2151 break; 2152 2153 case HTML5::ENDTAG: 2154 switch($token['name']) { 2155 /* An end tag with the tag name "body" */ 2156 case 'body': 2157 /* If the second element in the stack of open elements is 2158 not a body element, this is a parse error. Ignore the token. 2159 (innerHTML case) */ 2160 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { 2161 // Ignore. 2162 2163 /* If the current node is not the body element, then this 2164 is a parse error. */ 2165 } elseif(end($this->stack)->nodeName !== 'body') { 2166 // Parse error. 2167 } 2168 2169 /* Change the insertion mode to "after body". */ 2170 $this->mode = self::AFTER_BODY; 2171 break; 2172 2173 /* An end tag with the tag name "html" */ 2174 case 'html': 2175 /* Act as if an end tag with tag name "body" had been seen, 2176 then, if that token wasn't ignored, reprocess the current 2177 token. 
*/ 2178 $this->inBody(array( 2179 'name' => 'body', 2180 'type' => HTML5::ENDTAG 2181 )); 2182 2183 return $this->afterBody($token); 2184 break; 2185 2186 /* An end tag whose tag name is one of: "address", "blockquote", 2187 "center", "dir", "div", "dl", "fieldset", "listing", "menu", 2188 "ol", "pre", "ul" */ 2189 case 'address': case 'blockquote': case 'center': case 'dir': 2190 case 'div': case 'dl': case 'fieldset': case 'listing': 2191 case 'menu': case 'ol': case 'pre': case 'ul': 2192 /* If the stack of open elements has an element in scope 2193 with the same tag name as that of the token, then generate 2194 implied end tags. */ 2195 if($this->elementInScope($token['name'])) { 2196 $this->generateImpliedEndTags(); 2197 2198 /* Now, if the current node is not an element with 2199 the same tag name as that of the token, then this 2200 is a parse error. */ 2201 // w/e 2202 2203 /* If the stack of open elements has an element in 2204 scope with the same tag name as that of the token, 2205 then pop elements from this stack until an element 2206 with that tag name has been popped from the stack. */ 2207 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2208 if($this->stack[$n]->nodeName === $token['name']) { 2209 $n = -1; 2210 } 2211 2212 array_pop($this->stack); 2213 } 2214 } 2215 break; 2216 2217 /* An end tag whose tag name is "form" */ 2218 case 'form': 2219 /* If the stack of open elements has an element in scope 2220 with the same tag name as that of the token, then generate 2221 implied end tags. */ 2222 if($this->elementInScope($token['name'])) { 2223 $this->generateImpliedEndTags(); 2224 2225 } 2226 2227 if(end($this->stack)->nodeName !== $token['name']) { 2228 /* Now, if the current node is not an element with the 2229 same tag name as that of the token, then this is a parse 2230 error. 
*/ 2231 // w/e 2232 2233 } else { 2234 /* Otherwise, if the current node is an element with 2235 the same tag name as that of the token pop that element 2236 from the stack. */ 2237 array_pop($this->stack); 2238 } 2239 2240 /* In any case, set the form element pointer to null. */ 2241 $this->form_pointer = null; 2242 break; 2243 2244 /* An end tag whose tag name is "p" */ 2245 case 'p': 2246 /* If the stack of open elements has a p element in scope, 2247 then generate implied end tags, except for p elements. */ 2248 if($this->elementInScope('p')) { 2249 $this->generateImpliedEndTags(array('p')); 2250 2251 /* If the current node is not a p element, then this is 2252 a parse error. */ 2253 // k 2254 2255 /* If the stack of open elements has a p element in 2256 scope, then pop elements from this stack until the stack 2257 no longer has a p element in scope. */ 2258 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2259 if($this->elementInScope('p')) { 2260 array_pop($this->stack); 2261 2262 } else { 2263 break; 2264 } 2265 } 2266 } 2267 break; 2268 2269 /* An end tag whose tag name is "dd", "dt", or "li" */ 2270 case 'dd': case 'dt': case 'li': 2271 /* If the stack of open elements has an element in scope 2272 whose tag name matches the tag name of the token, then 2273 generate implied end tags, except for elements with the 2274 same tag name as the token. */ 2275 if($this->elementInScope($token['name'])) { 2276 $this->generateImpliedEndTags(array($token['name'])); 2277 2278 /* If the current node is not an element with the same 2279 tag name as the token, then this is a parse error. */ 2280 // w/e 2281 2282 /* If the stack of open elements has an element in scope 2283 whose tag name matches the tag name of the token, then 2284 pop elements from this stack until an element with that 2285 tag name has been popped from the stack. 
*/ 2286 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2287 if($this->stack[$n]->nodeName === $token['name']) { 2288 $n = -1; 2289 } 2290 2291 array_pop($this->stack); 2292 } 2293 } 2294 break; 2295 2296 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", 2297 "h5", "h6" */ 2298 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 2299 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); 2300 2301 /* If the stack of open elements has in scope an element whose 2302 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2303 generate implied end tags. */ 2304 if($this->elementInScope($elements)) { 2305 $this->generateImpliedEndTags(); 2306 2307 /* Now, if the current node is not an element with the same 2308 tag name as that of the token, then this is a parse error. */ 2309 // w/e 2310 2311 /* If the stack of open elements has in scope an element 2312 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or 2313 "h6", then pop elements from the stack until an element 2314 with one of those tag names has been popped from the stack. */ 2315 while($this->elementInScope($elements)) { 2316 array_pop($this->stack); 2317 } 2318 } 2319 break; 2320 2321 /* An end tag whose tag name is one of: "a", "b", "big", "em", 2322 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2323 case 'a': case 'b': case 'big': case 'em': case 'font': 2324 case 'i': case 'nobr': case 's': case 'small': case 'strike': 2325 case 'strong': case 'tt': case 'u': 2326 /* 1. Let the formatting element be the last element in 2327 the list of active formatting elements that: 2328 * is between the end of the list and the last scope 2329 marker in the list, if any, or the start of the list 2330