[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/htmlpurifier/HTMLPurifier/Lexer/ -> PH5P.php (source)

   1  <?php
   2  
   3  require_once 'HTMLPurifier/Lexer/DOMLex.php';
   4  
   5  /**

   6   * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.

   7   * Requires PHP5, and occupies space in the HTML5 pseudo-namespace (may

   8   * cause conflicts, sorry).

   9   */
  10  
  11  class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {
  12      
  13      public function tokenizeHTML($html, $config, &$context) {
  14          $html = $this->normalize($html, $config, $context);
  15          $html = $this->wrapHTML( $html, $config, $context);
  16          $parser = new HTML5($html);
  17          $doc = $parser->save();
  18          $tokens = array();
  19          $this->tokenizeDOM(
  20              $doc->getElementsByTagName('html')->item(0)-> // <html>
  21                    getElementsByTagName('body')->item(0)-> //   <body>
  22                    getElementsByTagName('div')->item(0)    //     <div>
  23              , $tokens);
  24          return $tokens;
  25      }
  26      
  27  }
  28  
  29  /*

  30  

  31  Copyright 2007 Jeroen van der Meer <http://jero.net/> 

  32  

  33  Permission is hereby granted, free of charge, to any person obtaining a 

  34  copy of this software and associated documentation files (the 

  35  "Software"), to deal in the Software without restriction, including 

  36  without limitation the rights to use, copy, modify, merge, publish, 

  37  distribute, sublicense, and/or sell copies of the Software, and to 

  38  permit persons to whom the Software is furnished to do so, subject to 

  39  the following conditions: 

  40  

  41  The above copyright notice and this permission notice shall be included 

  42  in all copies or substantial portions of the Software. 

  43  

  44  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 

  45  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 

  46  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 

  47  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 

  48  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 

  49  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 

  50  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 

  51  

  52  */
  53  
  54  class HTML5 {
  55      private $data;
  56      private $char;
  57      private $EOF;
  58      private $state;
  59      private $tree;
  60      private $token;
  61      private $content_model;
  62      private $escape = false;
  63      private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
  64      'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
  65      'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
  66      'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
  67      'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
  68      'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
  69      'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
  70      'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
  71      'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
  72      'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
  73      'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
  74      'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
  75      'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
  76      'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
  77      'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
  78      'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
  79      'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
  80      'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
  81      'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
  82      'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
  83      'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
  84      'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
  85      'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
  86      'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
  87      'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
  88      'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
  89      'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
  90      'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
  91      'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
  92      'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
  93      'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
  94      'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
  95      'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
  96      'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
  97      'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
  98      'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
  99      'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
 100      'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
 101      'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
 102      'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
 103      'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
 104      'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
 105  
 106      const PCDATA    = 0;
 107      const RCDATA    = 1;
 108      const CDATA     = 2;
 109      const PLAINTEXT = 3;
 110  
 111      const DOCTYPE  = 0;
 112      const STARTTAG = 1;
 113      const ENDTAG   = 2;
 114      const COMMENT  = 3;
 115      const CHARACTR = 4;
 116      const EOF      = 5;
 117  
 118      public function __construct($data) {
 119          $data = str_replace("\r\n", "\n", $data);
 120          $date = str_replace("\r", null, $data);
 121  
 122          $this->data = $data;
 123          $this->char = -1;
 124          $this->EOF  = strlen($data);
 125          $this->tree = new HTML5TreeConstructer;
 126          $this->content_model = self::PCDATA;
 127  
 128          $this->state = 'data';
 129  
 130          while($this->state !== null) {
 131              $this->{$this->state.'State'}();
 132          }
 133      }
 134  
 135      public function save() {
 136          return $this->tree->save();
 137      }
 138  
 139      private function char() {
 140          return ($this->char < $this->EOF)
 141              ? $this->data[$this->char]
 142              : false;
 143      }
 144  
 145      private function character($s, $l = 0) {
 146          if($s + $l < $this->EOF) {
 147              if($l === 0) {
 148                  return $this->data[$s];
 149              } else {
 150                  return substr($this->data, $s, $l);
 151              }
 152          }
 153      }
 154  
 155      private function characters($char_class, $start) {
 156          return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
 157      }
 158  
 159      private function dataState() {
 160          // Consume the next input character

 161          $this->char++;
 162          $char = $this->char();
 163  
 164          if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
 165              /* U+0026 AMPERSAND (&)

 166              When the content model flag is set to one of the PCDATA or RCDATA

 167              states: switch to the entity data state. Otherwise: treat it as per

 168              the "anything else"    entry below. */
 169              $this->state = 'entityData';
 170  
 171          } elseif($char === '-') {
 172              /* If the content model flag is set to either the RCDATA state or

 173              the CDATA state, and the escape flag is false, and there are at

 174              least three characters before this one in the input stream, and the

 175              last four characters in the input stream, including this one, are

 176              U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,

 177              and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 178              if(($this->content_model === self::RCDATA || $this->content_model ===
 179              self::CDATA) && $this->escape === false &&
 180              $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
 181                  $this->escape = true;
 182              }
 183  
 184              /* In any case, emit the input character as a character token. Stay

 185              in the data state. */
 186              $this->emitToken(array(
 187                  'type' => self::CHARACTR,
 188                  'data' => $char
 189              ));
 190  
 191          /* U+003C LESS-THAN SIGN (<) */

 192          } elseif($char === '<' && ($this->content_model === self::PCDATA ||
 193          (($this->content_model === self::RCDATA ||
 194          $this->content_model === self::CDATA) && $this->escape === false))) {
 195              /* When the content model flag is set to the PCDATA state: switch

 196              to the tag open state.

 197  

 198              When the content model flag is set to either the RCDATA state or

 199              the CDATA state and the escape flag is false: switch to the tag

 200              open state.

 201  

 202              Otherwise: treat it as per the "anything else" entry below. */
 203              $this->state = 'tagOpen';
 204  
 205          /* U+003E GREATER-THAN SIGN (>) */

 206          } elseif($char === '>') {
 207              /* If the content model flag is set to either the RCDATA state or

 208              the CDATA state, and the escape flag is true, and the last three

 209              characters in the input stream including this one are U+002D

 210              HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),

 211              set the escape flag to false. */
 212              if(($this->content_model === self::RCDATA ||
 213              $this->content_model === self::CDATA) && $this->escape === true &&
 214              $this->character($this->char, 3) === '-->') {
 215                  $this->escape = false;
 216              }
 217  
 218              /* In any case, emit the input character as a character token.

 219              Stay in the data state. */
 220              $this->emitToken(array(
 221                  'type' => self::CHARACTR,
 222                  'data' => $char
 223              ));
 224  
 225          } elseif($this->char === $this->EOF) {
 226              /* EOF

 227              Emit an end-of-file token. */
 228              $this->EOF();
 229  
 230          } elseif($this->content_model === self::PLAINTEXT) {
 231              /* When the content model flag is set to the PLAINTEXT state

 232              THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of

 233              the text and emit it as a character token. */
 234              $this->emitToken(array(
 235                  'type' => self::CHARACTR,
 236                  'data' => substr($this->data, $this->char)
 237              ));
 238  
 239              $this->EOF();
 240  
 241          } else {
 242              /* Anything else

 243              THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that

 244              otherwise would also be treated as a character token and emit it

 245              as a single character token. Stay in the data state. */
 246              $len  = strcspn($this->data, '<&', $this->char);
 247              $char = substr($this->data, $this->char, $len);
 248              $this->char += $len - 1;
 249  
 250              $this->emitToken(array(
 251                  'type' => self::CHARACTR,
 252                  'data' => $char
 253              ));
 254  
 255              $this->state = 'data';
 256          }
 257      }
 258  
 259      private function entityDataState() {
 260          // Attempt to consume an entity.

 261          $entity = $this->entity();
 262  
 263          // If nothing is returned, emit a U+0026 AMPERSAND character token.

 264          // Otherwise, emit the character token that was returned.

 265          $char = (!$entity) ? '&' : $entity;
 266          $this->emitToken(array(
 267              'type' => self::CHARACTR,
 268              'data' => $char
 269          ));
 270  
 271          // Finally, switch to the data state.

 272          $this->state = 'data';
 273      }
 274  
 275      private function tagOpenState() {
 276          switch($this->content_model) {
 277              case self::RCDATA:
 278              case self::CDATA:
 279                  /* If the next input character is a U+002F SOLIDUS (/) character,

 280                  consume it and switch to the close tag open state. If the next

 281                  input character is not a U+002F SOLIDUS (/) character, emit a

 282                  U+003C LESS-THAN SIGN character token and switch to the data

 283                  state to process the next input character. */
 284                  if($this->character($this->char + 1) === '/') {
 285                      $this->char++;
 286                      $this->state = 'closeTagOpen';
 287  
 288                  } else {
 289                      $this->emitToken(array(
 290                          'type' => self::CHARACTR,
 291                          'data' => '<'
 292                      ));
 293  
 294                      $this->state = 'data';
 295                  }
 296              break;
 297  
 298              case self::PCDATA:
 299                  // If the content model flag is set to the PCDATA state

 300                  // Consume the next input character:

 301                  $this->char++;
 302                  $char = $this->char();
 303  
 304                  if($char === '!') {
 305                      /* U+0021 EXCLAMATION MARK (!)

 306                      Switch to the markup declaration open state. */
 307                      $this->state = 'markupDeclarationOpen';
 308  
 309                  } elseif($char === '/') {
 310                      /* U+002F SOLIDUS (/)

 311                      Switch to the close tag open state. */
 312                      $this->state = 'closeTagOpen';
 313  
 314                  } elseif(preg_match('/^[A-Za-z]$/', $char)) {
 315                      /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z

 316                      Create a new start tag token, set its tag name to the lowercase

 317                      version of the input character (add 0x0020 to the character's code

 318                      point), then switch to the tag name state. (Don't emit the token

 319                      yet; further details will be filled in before it is emitted.) */
 320                      $this->token = array(
 321                          'name'  => strtolower($char),
 322                          'type'  => self::STARTTAG,
 323                          'attr'  => array()
 324                      );
 325  
 326                      $this->state = 'tagName';
 327  
 328                  } elseif($char === '>') {
 329                      /* U+003E GREATER-THAN SIGN (>)

 330                      Parse error. Emit a U+003C LESS-THAN SIGN character token and a

 331                      U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 332                      $this->emitToken(array(
 333                          'type' => self::CHARACTR,
 334                          'data' => '<>'
 335                      ));
 336  
 337                      $this->state = 'data';
 338  
 339                  } elseif($char === '?') {
 340                      /* U+003F QUESTION MARK (?)

 341                      Parse error. Switch to the bogus comment state. */
 342                      $this->state = 'bogusComment';
 343  
 344                  } else {
 345                      /* Anything else

 346                      Parse error. Emit a U+003C LESS-THAN SIGN character token and

 347                      reconsume the current input character in the data state. */
 348                      $this->emitToken(array(
 349                          'type' => self::CHARACTR,
 350                          'data' => '<'
 351                      ));
 352  
 353                      $this->char--;
 354                      $this->state = 'data';
 355                  }
 356              break;
 357          }
 358      }
 359  
 360      private function closeTagOpenState() {
 361          $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
 362          $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
 363  
 364          if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
 365          (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
 366          $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
 367              /* If the content model flag is set to the RCDATA or CDATA states then

 368              examine the next few characters. If they do not match the tag name of

 369              the last start tag token emitted (case insensitively), or if they do but

 370              they are not immediately followed by one of the following characters:

 371                  * U+0009 CHARACTER TABULATION

 372                  * U+000A LINE FEED (LF)

 373                  * U+000B LINE TABULATION

 374                  * U+000C FORM FEED (FF)

 375                  * U+0020 SPACE

 376                  * U+003E GREATER-THAN SIGN (>)

 377                  * U+002F SOLIDUS (/)

 378                  * EOF

 379              ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character

 380              token, a U+002F SOLIDUS character token, and switch to the data state

 381              to process the next input character. */
 382              $this->emitToken(array(
 383                  'type' => self::CHARACTR,
 384                  'data' => '</'
 385              ));
 386  
 387              $this->state = 'data';
 388  
 389          } else {
 390              /* Otherwise, if the content model flag is set to the PCDATA state,

 391              or if the next few characters do match that tag name, consume the

 392              next input character: */
 393              $this->char++;
 394              $char = $this->char();
 395  
 396              if(preg_match('/^[A-Za-z]$/', $char)) {
 397                  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z

 398                  Create a new end tag token, set its tag name to the lowercase version

 399                  of the input character (add 0x0020 to the character's code point), then

 400                  switch to the tag name state. (Don't emit the token yet; further details

 401                  will be filled in before it is emitted.) */
 402                  $this->token = array(
 403                      'name'  => strtolower($char),
 404                      'type'  => self::ENDTAG
 405                  );
 406  
 407                  $this->state = 'tagName';
 408  
 409              } elseif($char === '>') {
 410                  /* U+003E GREATER-THAN SIGN (>)

 411                  Parse error. Switch to the data state. */
 412                  $this->state = 'data';
 413  
 414              } elseif($this->char === $this->EOF) {
 415                  /* EOF

 416                  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F

 417                  SOLIDUS character token. Reconsume the EOF character in the data state. */
 418                  $this->emitToken(array(
 419                      'type' => self::CHARACTR,
 420                      'data' => '</'
 421                  ));
 422  
 423                  $this->char--;
 424                  $this->state = 'data';
 425  
 426              } else {
 427                  /* Parse error. Switch to the bogus comment state. */

 428                  $this->state = 'bogusComment';
 429              }
 430          }
 431      }
 432  
 433      private function tagNameState() {
 434          // Consume the next input character:

 435          $this->char++;
 436          $char = $this->character($this->char);
 437  
 438          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 439              /* U+0009 CHARACTER TABULATION

 440              U+000A LINE FEED (LF)

 441              U+000B LINE TABULATION

 442              U+000C FORM FEED (FF)

 443              U+0020 SPACE

 444              Switch to the before attribute name state. */
 445              $this->state = 'beforeAttributeName';
 446  
 447          } elseif($char === '>') {
 448              /* U+003E GREATER-THAN SIGN (>)

 449              Emit the current tag token. Switch to the data state. */
 450              $this->emitToken($this->token);
 451              $this->state = 'data';
 452  
 453          } elseif($this->char === $this->EOF) {
 454              /* EOF

 455              Parse error. Emit the current tag token. Reconsume the EOF

 456              character in the data state. */
 457              $this->emitToken($this->token);
 458  
 459              $this->char--;
 460              $this->state = 'data';
 461  
 462          } elseif($char === '/') {
 463              /* U+002F SOLIDUS (/)

 464              Parse error unless this is a permitted slash. Switch to the before

 465              attribute name state. */
 466              $this->state = 'beforeAttributeName';
 467  
 468          } else {
 469              /* Anything else

 470              Append the current input character to the current tag token's tag name.

 471              Stay in the tag name state. */
 472              $this->token['name'] .= strtolower($char);
 473              $this->state = 'tagName';
 474          }
 475      }
 476  
 477      private function beforeAttributeNameState() {
 478          // Consume the next input character:

 479          $this->char++;
 480          $char = $this->character($this->char);
 481  
 482          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 483              /* U+0009 CHARACTER TABULATION

 484              U+000A LINE FEED (LF)

 485              U+000B LINE TABULATION

 486              U+000C FORM FEED (FF)

 487              U+0020 SPACE

 488              Stay in the before attribute name state. */
 489              $this->state = 'beforeAttributeName';
 490  
 491          } elseif($char === '>') {
 492              /* U+003E GREATER-THAN SIGN (>)

 493              Emit the current tag token. Switch to the data state. */
 494              $this->emitToken($this->token);
 495              $this->state = 'data';
 496  
 497          } elseif($char === '/') {
 498              /* U+002F SOLIDUS (/)

 499              Parse error unless this is a permitted slash. Stay in the before

 500              attribute name state. */
 501              $this->state = 'beforeAttributeName';
 502  
 503          } elseif($this->char === $this->EOF) {
 504              /* EOF

 505              Parse error. Emit the current tag token. Reconsume the EOF

 506              character in the data state. */
 507              $this->emitToken($this->token);
 508  
 509              $this->char--;
 510              $this->state = 'data';
 511  
 512          } else {
 513              /* Anything else

 514              Start a new attribute in the current tag token. Set that attribute's

 515              name to the current input character, and its value to the empty string.

 516              Switch to the attribute name state. */
 517              $this->token['attr'][] = array(
 518                  'name'  => strtolower($char),
 519                  'value' => null
 520              );
 521  
 522              $this->state = 'attributeName';
 523          }
 524      }
 525  
 526      private function attributeNameState() {
 527          // Consume the next input character:

 528          $this->char++;
 529          $char = $this->character($this->char);
 530  
 531          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 532              /* U+0009 CHARACTER TABULATION

 533              U+000A LINE FEED (LF)

 534              U+000B LINE TABULATION

 535              U+000C FORM FEED (FF)

 536              U+0020 SPACE

 537              Stay in the before attribute name state. */
 538              $this->state = 'afterAttributeName';
 539  
 540          } elseif($char === '=') {
 541              /* U+003D EQUALS SIGN (=)

 542              Switch to the before attribute value state. */
 543              $this->state = 'beforeAttributeValue';
 544  
 545          } elseif($char === '>') {
 546              /* U+003E GREATER-THAN SIGN (>)

 547              Emit the current tag token. Switch to the data state. */
 548              $this->emitToken($this->token);
 549              $this->state = 'data';
 550  
 551          } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
 552              /* U+002F SOLIDUS (/)

 553              Parse error unless this is a permitted slash. Switch to the before

 554              attribute name state. */
 555              $this->state = 'beforeAttributeName';
 556  
 557          } elseif($this->char === $this->EOF) {
 558              /* EOF

 559              Parse error. Emit the current tag token. Reconsume the EOF

 560              character in the data state. */
 561              $this->emitToken($this->token);
 562  
 563              $this->char--;
 564              $this->state = 'data';
 565  
 566          } else {
 567              /* Anything else

 568              Append the current input character to the current attribute's name.

 569              Stay in the attribute name state. */
 570              $last = count($this->token['attr']) - 1;
 571              $this->token['attr'][$last]['name'] .= strtolower($char);
 572  
 573              $this->state = 'attributeName';
 574          }
 575      }
 576  
 577      private function afterAttributeNameState() {
 578          // Consume the next input character:

 579          $this->char++;
 580          $char = $this->character($this->char);
 581  
 582          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 583              /* U+0009 CHARACTER TABULATION

 584              U+000A LINE FEED (LF)

 585              U+000B LINE TABULATION

 586              U+000C FORM FEED (FF)

 587              U+0020 SPACE

 588              Stay in the after attribute name state. */
 589              $this->state = 'afterAttributeName';
 590  
 591          } elseif($char === '=') {
 592              /* U+003D EQUALS SIGN (=)

 593              Switch to the before attribute value state. */
 594              $this->state = 'beforeAttributeValue';
 595  
 596          } elseif($char === '>') {
 597              /* U+003E GREATER-THAN SIGN (>)

 598              Emit the current tag token. Switch to the data state. */
 599              $this->emitToken($this->token);
 600              $this->state = 'data';
 601  
 602          } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
 603              /* U+002F SOLIDUS (/)

 604              Parse error unless this is a permitted slash. Switch to the

 605              before attribute name state. */
 606              $this->state = 'beforeAttributeName';
 607  
 608          } elseif($this->char === $this->EOF) {
 609              /* EOF

 610              Parse error. Emit the current tag token. Reconsume the EOF

 611              character in the data state. */
 612              $this->emitToken($this->token);
 613  
 614              $this->char--;
 615              $this->state = 'data';
 616  
 617          } else {
 618              /* Anything else

 619              Start a new attribute in the current tag token. Set that attribute's

 620              name to the current input character, and its value to the empty string.

 621              Switch to the attribute name state. */
 622              $this->token['attr'][] = array(
 623                  'name'  => strtolower($char),
 624                  'value' => null
 625              );
 626  
 627              $this->state = 'attributeName';
 628          }
 629      }
 630  
 631      private function beforeAttributeValueState() {
 632          // Consume the next input character:

 633          $this->char++;
 634          $char = $this->character($this->char);
 635  
 636          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 637              /* U+0009 CHARACTER TABULATION

 638              U+000A LINE FEED (LF)

 639              U+000B LINE TABULATION

 640              U+000C FORM FEED (FF)

 641              U+0020 SPACE

 642              Stay in the before attribute value state. */
 643              $this->state = 'beforeAttributeValue';
 644  
 645          } elseif($char === '"') {
 646              /* U+0022 QUOTATION MARK (")

 647              Switch to the attribute value (double-quoted) state. */
 648              $this->state = 'attributeValueDoubleQuoted';
 649  
 650          } elseif($char === '&') {
 651              /* U+0026 AMPERSAND (&)

 652              Switch to the attribute value (unquoted) state and reconsume

 653              this input character. */
 654              $this->char--;
 655              $this->state = 'attributeValueUnquoted';
 656  
 657          } elseif($char === '\'') {
 658              /* U+0027 APOSTROPHE (')

 659              Switch to the attribute value (single-quoted) state. */
 660              $this->state = 'attributeValueSingleQuoted';
 661  
 662          } elseif($char === '>') {
 663              /* U+003E GREATER-THAN SIGN (>)

 664              Emit the current tag token. Switch to the data state. */
 665              $this->emitToken($this->token);
 666              $this->state = 'data';
 667  
 668          } else {
 669              /* Anything else

 670              Append the current input character to the current attribute's value.

 671              Switch to the attribute value (unquoted) state. */
 672              $last = count($this->token['attr']) - 1;
 673              $this->token['attr'][$last]['value'] .= $char;
 674  
 675              $this->state = 'attributeValueUnquoted';
 676          }
 677      }
 678  
 679      private function attributeValueDoubleQuotedState() {
 680          // Consume the next input character:

 681          $this->char++;
 682          $char = $this->character($this->char);
 683  
 684          if($char === '"') {
 685              /* U+0022 QUOTATION MARK (")

 686              Switch to the before attribute name state. */
 687              $this->state = 'beforeAttributeName';
 688  
 689          } elseif($char === '&') {
 690              /* U+0026 AMPERSAND (&)

 691              Switch to the entity in attribute value state. */
 692              $this->entityInAttributeValueState('double');
 693  
 694          } elseif($this->char === $this->EOF) {
 695              /* EOF

 696              Parse error. Emit the current tag token. Reconsume the character

 697              in the data state. */
 698              $this->emitToken($this->token);
 699  
 700              $this->char--;
 701              $this->state = 'data';
 702  
 703          } else {
 704              /* Anything else

 705              Append the current input character to the current attribute's value.

 706              Stay in the attribute value (double-quoted) state. */
 707              $last = count($this->token['attr']) - 1;
 708              $this->token['attr'][$last]['value'] .= $char;
 709  
 710              $this->state = 'attributeValueDoubleQuoted';
 711          }
 712      }
 713  
 714      private function attributeValueSingleQuotedState() {
 715          // Consume the next input character:

 716          $this->char++;
 717          $char = $this->character($this->char);
 718  
 719          if($char === '\'') {
 720              /* U+0022 QUOTATION MARK (')

 721              Switch to the before attribute name state. */
 722              $this->state = 'beforeAttributeName';
 723  
 724          } elseif($char === '&') {
 725              /* U+0026 AMPERSAND (&)

 726              Switch to the entity in attribute value state. */
 727              $this->entityInAttributeValueState('single');
 728  
 729          } elseif($this->char === $this->EOF) {
 730              /* EOF

 731              Parse error. Emit the current tag token. Reconsume the character

 732              in the data state. */
 733              $this->emitToken($this->token);
 734  
 735              $this->char--;
 736              $this->state = 'data';
 737  
 738          } else {
 739              /* Anything else

 740              Append the current input character to the current attribute's value.

 741              Stay in the attribute value (single-quoted) state. */
 742              $last = count($this->token['attr']) - 1;
 743              $this->token['attr'][$last]['value'] .= $char;
 744  
 745              $this->state = 'attributeValueSingleQuoted';
 746          }
 747      }
 748  
 749      private function attributeValueUnquotedState() {
 750          // Consume the next input character:

 751          $this->char++;
 752          $char = $this->character($this->char);
 753  
 754          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 755              /* U+0009 CHARACTER TABULATION

 756              U+000A LINE FEED (LF)

 757              U+000B LINE TABULATION

 758              U+000C FORM FEED (FF)

 759              U+0020 SPACE

 760              Switch to the before attribute name state. */
 761              $this->state = 'beforeAttributeName';
 762  
 763          } elseif($char === '&') {
 764              /* U+0026 AMPERSAND (&)

 765              Switch to the entity in attribute value state. */
 766              $this->entityInAttributeValueState();
 767  
 768          } elseif($char === '>') {
 769              /* U+003E GREATER-THAN SIGN (>)

 770              Emit the current tag token. Switch to the data state. */
 771              $this->emitToken($this->token);
 772              $this->state = 'data';
 773  
 774          } else {
 775              /* Anything else

 776              Append the current input character to the current attribute's value.

 777              Stay in the attribute value (unquoted) state. */
 778              $last = count($this->token['attr']) - 1;
 779              $this->token['attr'][$last]['value'] .= $char;
 780  
 781              $this->state = 'attributeValueUnquoted';
 782          }
 783      }
 784  
 785      private function entityInAttributeValueState() {
 786          // Attempt to consume an entity.

 787          $entity = $this->entity();
 788  
 789          // If nothing is returned, append a U+0026 AMPERSAND character to the

 790          // current attribute's value. Otherwise, emit the character token that

 791          // was returned.

 792          $char = (!$entity)
 793              ? '&'
 794              : $entity;
 795  
 796          $last = count($this->token['attr']) - 1;
 797          $this->token['attr'][$last]['value'] .= $char;
 798      }
 799  
 800      private function bogusCommentState() {
 801          /* Consume every character up to the first U+003E GREATER-THAN SIGN

 802          character (>) or the end of the file (EOF), whichever comes first. Emit

 803          a comment token whose data is the concatenation of all the characters

 804          starting from and including the character that caused the state machine

 805          to switch into the bogus comment state, up to and including the last

 806          consumed character before the U+003E character, if any, or up to the

 807          end of the file otherwise. (If the comment was started by the end of

 808          the file (EOF), the token is empty.) */
 809          $data = $this->characters('^>', $this->char);
 810          $this->emitToken(array(
 811              'data' => $data,
 812              'type' => self::COMMENT
 813          ));
 814  
 815          $this->char += strlen($data);
 816  
 817          /* Switch to the data state. */

 818          $this->state = 'data';
 819  
 820          /* If the end of the file was reached, reconsume the EOF character. */

 821          if($this->char === $this->EOF) {
 822              $this->char = $this->EOF - 1;
 823          }
 824      }
 825  
 826      private function markupDeclarationOpenState() {
 827          /* If the next two characters are both U+002D HYPHEN-MINUS (-)

 828          characters, consume those two characters, create a comment token whose

 829          data is the empty string, and switch to the comment state. */
 830          if($this->character($this->char + 1, 2) === '--') {
 831              $this->char += 2;
 832              $this->state = 'comment';
 833              $this->token = array(
 834                  'data' => null,
 835                  'type' => self::COMMENT
 836              );
 837  
 838          /* Otherwise if the next seven chacacters are a case-insensitive match

 839          for the word "DOCTYPE", then consume those characters and switch to the

 840          DOCTYPE state. */
 841          } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
 842              $this->char += 7;
 843              $this->state = 'doctype';
 844  
 845          /* Otherwise, is is a parse error. Switch to the bogus comment state.

 846          The next character that is consumed, if any, is the first character

 847          that will be in the comment. */
 848          } else {
 849              $this->char++;
 850              $this->state = 'bogusComment';
 851          }
 852      }
 853  
 854      private function commentState() {
 855          /* Consume the next input character: */

 856          $this->char++;
 857          $char = $this->char();
 858  
 859          /* U+002D HYPHEN-MINUS (-) */

 860          if($char === '-') {
 861              /* Switch to the comment dash state  */

 862              $this->state = 'commentDash';
 863  
 864          /* EOF */

 865          } elseif($this->char === $this->EOF) {
 866              /* Parse error. Emit the comment token. Reconsume the EOF character

 867              in the data state. */
 868              $this->emitToken($this->token);
 869              $this->char--;
 870              $this->state = 'data';
 871  
 872          /* Anything else */

 873          } else {
 874              /* Append the input character to the comment token's data. Stay in

 875              the comment state. */
 876              $this->token['data'] .= $char;
 877          }
 878      }
 879  
 880      private function commentDashState() {
 881          /* Consume the next input character: */

 882          $this->char++;
 883          $char = $this->char();
 884  
 885          /* U+002D HYPHEN-MINUS (-) */

 886          if($char === '-') {
 887              /* Switch to the comment end state  */

 888              $this->state = 'commentEnd';
 889  
 890          /* EOF */

 891          } elseif($this->char === $this->EOF) {
 892              /* Parse error. Emit the comment token. Reconsume the EOF character

 893              in the data state. */
 894              $this->emitToken($this->token);
 895              $this->char--;
 896              $this->state = 'data';
 897  
 898          /* Anything else */

 899          } else {
 900              /* Append a U+002D HYPHEN-MINUS (-) character and the input

 901              character to the comment token's data. Switch to the comment state. */
 902              $this->token['data'] .= '-'.$char;
 903              $this->state = 'comment';
 904          }
 905      }
 906  
 907      private function commentEndState() {
 908          /* Consume the next input character: */

 909          $this->char++;
 910          $char = $this->char();
 911  
 912          if($char === '>') {
 913              $this->emitToken($this->token);
 914              $this->state = 'data';
 915  
 916          } elseif($char === '-') {
 917              $this->token['data'] .= '-';
 918  
 919          } elseif($this->char === $this->EOF) {
 920              $this->emitToken($this->token);
 921              $this->char--;
 922              $this->state = 'data';
 923  
 924          } else {
 925              $this->token['data'] .= '--'.$char;
 926              $this->state = 'comment';
 927          }
 928      }
 929  
 930      private function doctypeState() {
 931          /* Consume the next input character: */

 932          $this->char++;
 933          $char = $this->char();
 934  
 935          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 936              $this->state = 'beforeDoctypeName';
 937  
 938          } else {
 939              $this->char--;
 940              $this->state = 'beforeDoctypeName';
 941          }
 942      }
 943  
 944      private function beforeDoctypeNameState() {
 945          /* Consume the next input character: */

 946          $this->char++;
 947          $char = $this->char();
 948  
 949          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 950              // Stay in the before DOCTYPE name state.

 951  
 952          } elseif(preg_match('/^[a-z]$/', $char)) {
 953              $this->token = array(
 954                  'name' => strtoupper($char),
 955                  'type' => self::DOCTYPE,
 956                  'error' => true
 957              );
 958  
 959              $this->state = 'doctypeName';
 960  
 961          } elseif($char === '>') {
 962              $this->emitToken(array(
 963                  'name' => null,
 964                  'type' => self::DOCTYPE,
 965                  'error' => true
 966              ));
 967  
 968              $this->state = 'data';
 969  
 970          } elseif($this->char === $this->EOF) {
 971              $this->emitToken(array(
 972                  'name' => null,
 973                  'type' => self::DOCTYPE,
 974                  'error' => true
 975              ));
 976  
 977              $this->char--;
 978              $this->state = 'data';
 979  
 980          } else {
 981              $this->token = array(
 982                  'name' => $char,
 983                  'type' => self::DOCTYPE,
 984                  'error' => true
 985              );
 986  
 987              $this->state = 'doctypeName';
 988          }
 989      }
 990  
 991      private function doctypeNameState() {
 992          /* Consume the next input character: */

 993          $this->char++;
 994          $char = $this->char();
 995  
 996          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 997              $this->state = 'AfterDoctypeName';
 998  
 999          } elseif($char === '>') {
1000              $this->emitToken($this->token);
1001              $this->state = 'data';
1002  
1003          } elseif(preg_match('/^[a-z]$/', $char)) {
1004              $this->token['name'] .= strtoupper($char);
1005  
1006          } elseif($this->char === $this->EOF) {
1007              $this->emitToken($this->token);
1008              $this->char--;
1009              $this->state = 'data';
1010  
1011          } else {
1012              $this->token['name'] .= $char;
1013          }
1014  
1015          $this->token['error'] = ($this->token['name'] === 'HTML')
1016              ? false
1017              : true;
1018      }
1019  
1020      private function afterDoctypeNameState() {
1021          /* Consume the next input character: */

1022          $this->char++;
1023          $char = $this->char();
1024  
1025          if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1026              // Stay in the DOCTYPE name state.

1027  
1028          } elseif($char === '>') {
1029              $this->emitToken($this->token);
1030              $this->state = 'data';
1031  
1032          } elseif($this->char === $this->EOF) {
1033              $this->emitToken($this->token);
1034              $this->char--;
1035              $this->state = 'data';
1036  
1037          } else {
1038              $this->token['error'] = true;
1039              $this->state = 'bogusDoctype';
1040          }
1041      }
1042  
1043      private function bogusDoctypeState() {
1044          /* Consume the next input character: */

1045          $this->char++;
1046          $char = $this->char();
1047  
1048          if($char === '>') {
1049              $this->emitToken($this->token);
1050              $this->state = 'data';
1051  
1052          } elseif($this->char === $this->EOF) {
1053              $this->emitToken($this->token);
1054              $this->char--;
1055              $this->state = 'data';
1056  
1057          } else {
1058              // Stay in the bogus DOCTYPE state.

1059          }
1060      }
1061  
1062      private function entity() {
1063          $start = $this->char;
1064  
1065          // This section defines how to consume an entity. This definition is

1066          // used when parsing entities in text and in attributes.

1067  
1068          // The behaviour depends on the identity of the next character (the

1069          // one immediately after the U+0026 AMPERSAND character): 

1070  
1071          switch($this->character($this->char + 1)) {
1072              // U+0023 NUMBER SIGN (#)

1073              case '#':
1074  
1075                  // The behaviour further depends on the character after the

1076                  // U+0023 NUMBER SIGN:

1077                  switch($this->character($this->char + 1)) {
1078                      // U+0078 LATIN SMALL LETTER X

1079                      // U+0058 LATIN CAPITAL LETTER X

1080                      case 'x':
1081                      case 'X':
1082                          // Follow the steps below, but using the range of

1083                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT

1084                          // NINE, U+0061 LATIN SMALL LETTER A through to U+0066

1085                          // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER

1086                          // A, through to U+0046 LATIN CAPITAL LETTER F (in other

1087                          // words, 0-9, A-F, a-f).

1088                          $char = 1;
1089                          $char_class = '0-9A-Fa-f';
1090                      break;
1091  
1092                      // Anything else

1093                      default:
1094                          // Follow the steps below, but using the range of

1095                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT

1096                          // NINE (i.e. just 0-9).

1097                          $char = 0;
1098                          $char_class = '0-9';
1099                      break;
1100                  }
1101  
1102                  // Consume as many characters as match the range of characters

1103                  // given above.

1104                  $this->char++;
1105                  $e_name = $this->characters($char_class, $this->char + $char + 1);
1106                  $entity = $this->character($start, $this->char);
1107                  $cond = strlen($e_name) > 0;
1108  
1109                  // The rest of the parsing happens bellow.

1110              break;
1111  
1112              // Anything else

1113              default:
1114                  // Consume the maximum number of characters possible, with the

1115                  // consumed characters case-sensitively matching one of the

1116                  // identifiers in the first column of the entities table.

1117                  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1118                  $len = strlen($e_name);
1119  
1120                  for($c = 1; $c <= $len; $c++) {
1121                      $id = substr($e_name, 0, $c);
1122                      $this->char++;
1123  
1124                      if(in_array($id, $this->entities)) {
1125                          if ($e_name[$c-1] !== ';') {
1126                              if ($c < $len && $e_name[$c] == ';') {
1127                                  $this->char++; // consume extra semicolon

1128                              }
1129                          }
1130                          $entity = $id;
1131                          break;
1132                      }
1133                  }
1134  
1135                  $cond = isset($entity);
1136                  // The rest of the parsing happens bellow.

1137              break;
1138          }
1139  
1140          if(!$cond) {
1141              // If no match can be made, then this is a parse error. No

1142              // characters are consumed, and nothing is returned.

1143              $this->char = $start;
1144              return false;
1145          }
1146  
1147          // Return a character token for the character corresponding to the

1148          // entity name (as given by the second column of the entities table).

1149          return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
1150      }
1151  
1152      private function emitToken($token) {
1153          $emit = $this->tree->emitToken($token);
1154  
1155          if(is_int($emit)) {
1156              $this->content_model = $emit;
1157  
1158          } elseif($token['type'] === self::ENDTAG) {
1159              $this->content_model = self::PCDATA;
1160          }
1161      }
1162  
1163      private function EOF() {
1164          $this->state = null;
1165          $this->tree->emitToken(array(
1166              'type' => self::EOF
1167          ));
1168      }
1169  }
1170  
1171  class HTML5TreeConstructer {
    // Stack of open elements. rootElementPhase() pushes the <html> element as
    // entry 0, and the last entry is treated as the current node.
    public $stack = array();

    // Current phase (one of the *_PHASE constants); selects the handler in
    // emitToken().
    private $phase;
    // Current insertion mode (one of the mode constants below); selects the
    // handler in mainPhase().
    private $mode;
    // The DOMDocument being built.
    private $dom;
    private $foster_parent = null;
    // NOTE(review): presumably the list of active formatting elements -- confirm.
    private $a_formatting  = array();

    // Set by beforeHead() to the element created for the <head> start tag.
    private $head_pointer = null;
    private $form_pointer = null;

    // Tag names grouped by element category (see the SPECIAL / SCOPING /
    // FORMATTING constants further down).
    private $scoping = array('button','caption','html','marquee','object','table','td','th');
    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
    private $special = array('address','area','base','basefont','bgsound',
    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
    'h6','head','hr','iframe','image','img','input','isindex','li','link',
    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
    'option','p','param','plaintext','pre','script','select','spacer','style',
    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');

    // The different phases. These values overlap with the insertion mode
    // values below (for example END_PHASE and IN_BODY are both 3), so a
    // phase constant must only ever be compared with $phase and a mode
    // constant only with $mode.
    const INIT_PHASE = 0;
    const ROOT_PHASE = 1;
    const MAIN_PHASE = 2;
    const END_PHASE  = 3;

    // The different insertion modes for the main phase.
    const BEFOR_HEAD = 0;
    const IN_HEAD    = 1;
    const AFTER_HEAD = 2;
    const IN_BODY    = 3;
    const IN_TABLE   = 4;
    const IN_CAPTION = 5;
    const IN_CGROUP  = 6;
    const IN_TBODY   = 7;
    const IN_ROW     = 8;
    const IN_CELL    = 9;
    const IN_SELECT  = 10;
    const AFTER_BODY = 11;
    const IN_FRAME   = 12;
    const AFTR_FRAME = 13;

    // The different types of elements.
    const SPECIAL    = 0;
    const SCOPING    = 1;
    const FORMATTING = 2;
    const PHRASING   = 3;

    const MARKER     = 0;
1222  
1223      public function __construct() {
1224          $this->phase = self::INIT_PHASE;
1225          $this->mode = self::BEFOR_HEAD;
1226          $this->dom = new DOMDocument;
1227  
1228          $this->dom->encoding = 'UTF-8';
1229          $this->dom->preserveWhiteSpace = true;
1230          $this->dom->substituteEntities = true;
1231          $this->dom->strictErrorChecking = false;
1232      }
1233  
1234      // Process tag tokens

1235      public function emitToken($token) {
1236          switch($this->phase) {
1237              case self::INIT_PHASE: return $this->initPhase($token); break;
1238              case self::ROOT_PHASE: return $this->rootElementPhase($token); break;
1239              case self::MAIN_PHASE: return $this->mainPhase($token); break;
1240              case self::END_PHASE : return $this->trailingEndPhase($token); break;
1241          }
1242      }
1243  
1244      private function initPhase($token) {
1245          /* Initially, the tree construction stage must handle each token

1246          emitted from the tokenisation stage as follows: */
1247  
1248          /* A DOCTYPE token that is marked as being in error

1249          A comment token

1250          A start tag token

1251          An end tag token

1252          A character token that is not one of one of U+0009 CHARACTER TABULATION,

1253              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1254              or U+0020 SPACE

1255          An end-of-file token */
1256          if((isset($token['error']) && $token['error']) ||
1257          $token['type'] === HTML5::COMMENT ||
1258          $token['type'] === HTML5::STARTTAG ||
1259          $token['type'] === HTML5::ENDTAG ||
1260          $token['type'] === HTML5::EOF ||
1261          ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1262          !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1263              /* This specification does not define how to handle this case. In

1264              particular, user agents may ignore the entirety of this specification

1265              altogether for such documents, and instead invoke special parse modes

1266              with a greater emphasis on backwards compatibility. */
1267  
1268              $this->phase = self::ROOT_PHASE;
1269              return $this->rootElementPhase($token);
1270  
1271          /* A DOCTYPE token marked as being correct */

1272          } elseif(isset($token['error']) && !$token['error']) {
1273              /* Append a DocumentType node to the Document  node, with the name

1274              attribute set to the name given in the DOCTYPE token (which will be

1275              "HTML"), and the other attributes specific to DocumentType objects

1276              set to null, empty lists, or the empty string as appropriate. */
1277              $doctype = new DOMDocumentType(null, null, 'HTML');
1278  
1279              /* Then, switch to the root element phase of the tree construction

1280              stage. */
1281              $this->phase = self::ROOT_PHASE;
1282  
1283          /* A character token that is one of one of U+0009 CHARACTER TABULATION,

1284          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1285          or U+0020 SPACE */
1286          } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1287          $token['data'])) {
1288              /* Append that character  to the Document node. */

1289              $text = $this->dom->createTextNode($token['data']);
1290              $this->dom->appendChild($text);
1291          }
1292      }
1293  
1294      private function rootElementPhase($token) {
1295          /* After the initial phase, as each token is emitted from the tokenisation

1296          stage, it must be processed as described in this section. */
1297  
1298          /* A DOCTYPE token */

1299          if($token['type'] === HTML5::DOCTYPE) {
1300              // Parse error. Ignore the token.

1301  
1302          /* A comment token */

1303          } elseif($token['type'] === HTML5::COMMENT) {
1304              /* Append a Comment node to the Document object with the data

1305              attribute set to the data given in the comment token. */
1306              $comment = $this->dom->createComment($token['data']);
1307              $this->dom->appendChild($comment);
1308  
1309          /* A character token that is one of one of U+0009 CHARACTER TABULATION,

1310          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1311          or U+0020 SPACE */
1312          } elseif($token['type'] === HTML5::CHARACTR &&
1313          preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1314              /* Append that character  to the Document node. */

1315              $text = $this->dom->createTextNode($token['data']);
1316              $this->dom->appendChild($text);
1317  
1318          /* A character token that is not one of U+0009 CHARACTER TABULATION,

1319              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED

1320              (FF), or U+0020 SPACE

1321          A start tag token

1322          An end tag token

1323          An end-of-file token */
1324          } elseif(($token['type'] === HTML5::CHARACTR &&
1325          !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1326          $token['type'] === HTML5::STARTTAG ||
1327          $token['type'] === HTML5::ENDTAG ||
1328          $token['type'] === HTML5::EOF) {
1329              /* Create an HTMLElement node with the tag name html, in the HTML

1330              namespace. Append it to the Document object. Switch to the main

1331              phase and reprocess the current token. */
1332              $html = $this->dom->createElement('html');
1333              $this->dom->appendChild($html);
1334              $this->stack[] = $html;
1335  
1336              $this->phase = self::MAIN_PHASE;
1337              return $this->mainPhase($token);
1338          }
1339      }
1340  
1341      private function mainPhase($token) {
1342          /* Tokens in the main phase must be handled as follows: */

1343  
1344          /* A DOCTYPE token */

1345          if($token['type'] === HTML5::DOCTYPE) {
1346              // Parse error. Ignore the token.

1347  
1348          /* A start tag token with the tag name "html" */

1349          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1350              /* If this start tag token was not the first start tag token, then

1351              it is a parse error. */
1352  
1353              /* For each attribute on the token, check to see if the attribute

1354              is already present on the top element of the stack of open elements.

1355              If it is not, add the attribute and its corresponding value to that

1356              element. */
1357              foreach($token['attr'] as $attr) {
1358                  if(!$this->stack[0]->hasAttribute($attr['name'])) {
1359                      $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1360                  }
1361              }
1362  
1363          /* An end-of-file token */

1364          } elseif($token['type'] === HTML5::EOF) {
1365              /* Generate implied end tags. */

1366              $this->generateImpliedEndTags();
1367  
1368          /* Anything else. */

1369          } else {
1370              /* Depends on the insertion mode: */

1371              switch($this->mode) {
1372                  case self::BEFOR_HEAD: return $this->beforeHead($token); break;
1373                  case self::IN_HEAD:    return $this->inHead($token); break;
1374                  case self::AFTER_HEAD: return $this->afterHead($token); break;
1375                  case self::IN_BODY:    return $this->inBody($token); break;
1376                  case self::IN_TABLE:   return $this->inTable($token); break;
1377                  case self::IN_CAPTION: return $this->inCaption($token); break;
1378                  case self::IN_CGROUP:  return $this->inColumnGroup($token); break;
1379                  case self::IN_TBODY:   return $this->inTableBody($token); break;
1380                  case self::IN_ROW:     return $this->inRow($token); break;
1381                  case self::IN_CELL:    return $this->inCell($token); break;
1382                  case self::IN_SELECT:  return $this->inSelect($token); break;
1383                  case self::AFTER_BODY: return $this->afterBody($token); break;
1384                  case self::IN_FRAME:   return $this->inFrameset($token); break;
1385                  case self::AFTR_FRAME: return $this->afterFrameset($token); break;
1386                  case self::END_PHASE:  return $this->trailingEndPhase($token); break;
1387              }
1388          }
1389      }
1390  
1391      private function beforeHead($token) {
1392          /* Handle the token as follows: */

1393  
1394          /* A character token that is one of one of U+0009 CHARACTER TABULATION,

1395          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1396          or U+0020 SPACE */
1397          if($token['type'] === HTML5::CHARACTR &&
1398          preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1399              /* Append the character to the current node. */

1400              $this->insertText($token['data']);
1401  
1402          /* A comment token */

1403          } elseif($token['type'] === HTML5::COMMENT) {
1404              /* Append a Comment node to the current node with the data attribute

1405              set to the data given in the comment token. */
1406              $this->insertComment($token['data']);
1407  
1408          /* A start tag token with the tag name "head" */

1409          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1410              /* Create an element for the token, append the new element to the

1411              current node and push it onto the stack of open elements. */
1412              $element = $this->insertElement($token);
1413  
1414              /* Set the head element pointer to this new element node. */

1415              $this->head_pointer = $element;
1416  
1417              /* Change the insertion mode to "in head". */

1418              $this->mode = self::IN_HEAD;
1419  
1420          /* A start tag token whose tag name is one of: "base", "link", "meta",

1421          "script", "style", "title". Or an end tag with the tag name "html".

1422          Or a character token that is not one of U+0009 CHARACTER TABULATION,

1423          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1424          or U+0020 SPACE. Or any other start tag token */
1425          } elseif($token['type'] === HTML5::STARTTAG ||
1426          ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1427          ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
1428          $token['data']))) {
1429              /* Act as if a start tag token with the tag name "head" and no

1430              attributes had been seen, then reprocess the current token. */
1431              $this->beforeHead(array(
1432                  'name' => 'head',
1433                  'type' => HTML5::STARTTAG,
1434                  'attr' => array()
1435              ));
1436  
1437              return $this->inHead($token);
1438  
1439          /* Any other end tag */

1440          } elseif($token['type'] === HTML5::ENDTAG) {
1441              /* Parse error. Ignore the token. */

1442          }
1443      }
1444  
1445      private function inHead($token) {
1446          /* Handle the token as follows: */

1447  
1448          /* A character token that is one of one of U+0009 CHARACTER TABULATION,

1449          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1450          or U+0020 SPACE.

1451  

1452          THIS DIFFERS FROM THE SPEC: If the current node is either a title, style

1453          or script element, append the character to the current node regardless

1454          of its content. */
1455          if(($token['type'] === HTML5::CHARACTR &&
1456          preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1457          $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1458          array('title', 'style', 'script')))) {
1459              /* Append the character to the current node. */

1460              $this->insertText($token['data']);
1461  
1462          /* A comment token */

1463          } elseif($token['type'] === HTML5::COMMENT) {
1464              /* Append a Comment node to the current node with the data attribute

1465              set to the data given in the comment token. */
1466              $this->insertComment($token['data']);
1467  
1468          } elseif($token['type'] === HTML5::ENDTAG &&
1469          in_array($token['name'], array('title', 'style', 'script'))) {
1470              array_pop($this->stack);
1471              return HTML5::PCDATA;
1472  
1473          /* A start tag with the tag name "title" */

1474          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
1475              /* Create an element for the token and append the new element to the

1476              node pointed to by the head element pointer, or, if that is null

1477              (innerHTML case), to the current node. */
1478              if($this->head_pointer !== null) {
1479                  $element = $this->insertElement($token, false);
1480                  $this->head_pointer->appendChild($element);
1481  
1482              } else {
1483                  $element = $this->insertElement($token);
1484              }
1485  
1486              /* Switch the tokeniser's content model flag  to the RCDATA state. */

1487              return HTML5::RCDATA;
1488  
1489          /* A start tag with the tag name "style" */

1490          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
1491              /* Create an element for the token and append the new element to the

1492              node pointed to by the head element pointer, or, if that is null

1493              (innerHTML case), to the current node. */
1494              if($this->head_pointer !== null) {
1495                  $element = $this->insertElement($token, false);
1496                  $this->head_pointer->appendChild($element);
1497  
1498              } else {
1499                  $this->insertElement($token);
1500              }
1501  
1502              /* Switch the tokeniser's content model flag  to the CDATA state. */

1503              return HTML5::CDATA;
1504  
1505          /* A start tag with the tag name "script" */

1506          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
1507              /* Create an element for the token. */

1508              $element = $this->insertElement($token, false);
1509              $this->head_pointer->appendChild($element);
1510  
1511              /* Switch the tokeniser's content model flag  to the CDATA state. */

1512              return HTML5::CDATA;
1513  
1514          /* A start tag with the tag name "base", "link", or "meta" */

1515          } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1516          array('base', 'link', 'meta'))) {
1517              /* Create an element for the token and append the new element to the

1518              node pointed to by the head element pointer, or, if that is null

1519              (innerHTML case), to the current node. */
1520              if($this->head_pointer !== null) {
1521                  $element = $this->insertElement($token, false);
1522                  $this->head_pointer->appendChild($element);
1523                  array_pop($this->stack);
1524  
1525              } else {
1526                  $this->insertElement($token);
1527              }
1528  
1529          /* An end tag with the tag name "head" */

1530          } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
1531              /* If the current node is a head element, pop the current node off

1532              the stack of open elements. */
1533              if($this->head_pointer->isSameNode(end($this->stack))) {
1534                  array_pop($this->stack);
1535  
1536              /* Otherwise, this is a parse error. */

1537              } else {
1538                  // k

1539              }
1540  
1541              /* Change the insertion mode to "after head". */

1542              $this->mode = self::AFTER_HEAD;
1543  
1544          /* A start tag with the tag name "head" or an end tag except "html". */

1545          } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
1546          ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
1547              // Parse error. Ignore the token.

1548  
1549          /* Anything else */

1550          } else {
1551              /* If the current node is a head element, act as if an end tag

1552              token with the tag name "head" had been seen. */
1553              if($this->head_pointer->isSameNode(end($this->stack))) {
1554                  $this->inHead(array(
1555                      'name' => 'head',
1556                      'type' => HTML5::ENDTAG
1557                  ));
1558  
1559              /* Otherwise, change the insertion mode to "after head". */

1560              } else {
1561                  $this->mode = self::AFTER_HEAD;
1562              }
1563  
1564              /* Then, reprocess the current token. */

1565              return $this->afterHead($token);
1566          }
1567      }
1568  
1569      private function afterHead($token) {
1570          /* Handle the token as follows: */

1571  
1572          /* A character token that is one of one of U+0009 CHARACTER TABULATION,

1573          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),

1574          or U+0020 SPACE */
1575          if($token['type'] === HTML5::CHARACTR &&
1576          preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1577              /* Append the character to the current node. */

1578              $this->insertText($token['data']);
1579  
1580          /* A comment token */

1581          } elseif($token['type'] === HTML5::COMMENT) {
1582              /* Append a Comment node to the current node with the data attribute

1583              set to the data given in the comment token. */
1584              $this->insertComment($token['data']);
1585  
1586          /* A start tag token with the tag name "body" */

1587          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
1588              /* Insert a body element for the token. */

1589              $this->insertElement($token);
1590  
1591              /* Change the insertion mode to "in body". */

1592              $this->mode = self::IN_BODY;
1593  
1594          /* A start tag token with the tag name "frameset" */

1595          } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
1596              /* Insert a frameset element for the token. */

1597              $this->insertElement($token);
1598  
1599              /* Change the insertion mode to "in frameset". */

1600              $this->mode = self::IN_FRAME;
1601  
1602          /* A start tag token whose tag name is one of: "base", "link", "meta",

1603          "script", "style", "title" */
1604          } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1605          array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1606              /* Parse error. Switch the insertion mode back to "in head" and

1607              reprocess the token. */
1608              $this->mode = self::IN_HEAD;
1609              return $this->inHead($token);
1610  
1611          /* Anything else */

1612          } else {
1613              /* Act as if a start tag token with the tag name "body" and no

1614              attributes had been seen, and then reprocess the current token. */
1615              $this->afterHead(array(
1616                  'name' => 'body',
1617                  'type' => HTML5::STARTTAG,
1618                  'attr' => array()
1619              ));
1620  
1621              return $this->inBody($token);
1622          }
1623      }
1624  
1625      private function inBody($token) {
1626          /* Handle the token as follows: */

1627  
1628          switch($token['type']) {
1629              /* A character token */

1630              case HTML5::CHARACTR:
1631                  /* Reconstruct the active formatting elements, if any. */

1632                  $this->reconstructActiveFormattingElements();
1633  
1634                  /* Append the token's character to the current node. */

1635                  $this->insertText($token['data']);
1636              break;
1637  
1638              /* A comment token */

1639              case HTML5::COMMENT:
1640                  /* Append a Comment node to the current node with the data

1641                  attribute set to the data given in the comment token. */
1642                  $this->insertComment($token['data']);
1643              break;
1644  
1645              case HTML5::STARTTAG:
1646              switch($token['name']) {
1647                  /* A start tag token whose tag name is one of: "script",

1648                  "style" */
1649                  case 'script': case 'style':
1650                      /* Process the token as if the insertion mode had been "in

1651                      head". */
1652                      return $this->inHead($token);
1653                  break;
1654  
1655                  /* A start tag token whose tag name is one of: "base", "link",

1656                  "meta", "title" */
1657                  case 'base': case 'link': case 'meta': case 'title':
1658                      /* Parse error. Process the token as if the insertion mode

1659                      had    been "in head". */
1660                      return $this->inHead($token);
1661                  break;
1662  
1663                  /* A start tag token with the tag name "body" */

1664                  case 'body':
1665                      /* Parse error. If the second element on the stack of open

1666                      elements is not a body element, or, if the stack of open

1667                      elements has only one node on it, then ignore the token.

1668                      (innerHTML case) */
1669                      if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1670                          // Ignore

1671  
1672                      /* Otherwise, for each attribute on the token, check to see

1673                      if the attribute is already present on the body element (the

1674                      second element)    on the stack of open elements. If it is not,

1675                      add the attribute and its corresponding value to that

1676                      element. */
1677                      } else {
1678                          foreach($token['attr'] as $attr) {
1679                              if(!$this->stack[1]->hasAttribute($attr['name'])) {
1680                                  $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1681                              }
1682                          }
1683                      }
1684                  break;
1685  
1686                  /* A start tag whose tag name is one of: "address",

1687                  "blockquote", "center", "dir", "div", "dl", "fieldset",

1688                  "listing", "menu", "ol", "p", "ul" */
1689                  case 'address': case 'blockquote': case 'center': case 'dir':
1690                  case 'div': case 'dl': case 'fieldset': case 'listing':
1691                  case 'menu': case 'ol': case 'p': case 'ul':
1692                      /* If the stack of open elements has a p element in scope,

1693                      then act as if an end tag with the tag name p had been

1694                      seen. */
1695                      if($this->elementInScope('p')) {
1696                          $this->emitToken(array(
1697                              'name' => 'p',
1698                              'type' => HTML5::ENDTAG
1699                          ));
1700                      }
1701  
1702                      /* Insert an HTML element for the token. */

1703                      $this->insertElement($token);
1704                  break;
1705  
1706                  /* A start tag whose tag name is "form" */

1707                  case 'form':
1708                      /* If the form element pointer is not null, ignore the

1709                      token with a parse error. */
1710                      if($this->form_pointer !== null) {
1711                          // Ignore.

1712  
1713                      /* Otherwise: */

1714                      } else {
1715                          /* If the stack of open elements has a p element in

1716                          scope, then act as if an end tag with the tag name p

1717                          had been seen. */
1718                          if($this->elementInScope('p')) {
1719                              $this->emitToken(array(
1720                                  'name' => 'p',
1721                                  'type' => HTML5::ENDTAG
1722                              ));
1723                          }
1724  
1725                          /* Insert an HTML element for the token, and set the

1726                          form element pointer to point to the element created. */
1727                          $element = $this->insertElement($token);
1728                          $this->form_pointer = $element;
1729                      }
1730                  break;
1731  
1732                  /* A start tag whose tag name is "li", "dd" or "dt" */

1733                  case 'li': case 'dd': case 'dt':
1734                      /* If the stack of open elements has a p  element in scope,

1735                      then act as if an end tag with the tag name p had been

1736                      seen. */
1737                      if($this->elementInScope('p')) {
1738                          $this->emitToken(array(
1739                              'name' => 'p',
1740                              'type' => HTML5::ENDTAG
1741                          ));
1742                      }
1743  
1744                      $stack_length = count($this->stack) - 1;
1745  
1746                      for($n = $stack_length; 0 <= $n; $n--) {
1747                          /* 1. Initialise node to be the current node (the

1748                          bottommost node of the stack). */
1749                          $stop = false;
1750                          $node = $this->stack[$n];
1751                          $cat  = $this->getElementCategory($node->tagName);
1752  
1753                          /* 2. If node is an li, dd or dt element, then pop all

1754                          the    nodes from the current node up to node, including

1755                          node, then stop this algorithm. */
1756                          if($token['name'] === $node->tagName ||    ($token['name'] !== 'li'
1757                          && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1758                              for($x = $stack_length; $x >= $n ; $x--) {
1759                                  array_pop($this->stack);
1760                              }
1761  
1762                              break;
1763                          }
1764  
1765                          /* 3. If node is not in the formatting category, and is

1766                          not    in the phrasing category, and is not an address or

1767                          div element, then stop this algorithm. */
1768                          if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1769                          $node->tagName !== 'address' && $node->tagName !== 'div') {
1770                              break;
1771                          }
1772                      }
1773  
1774                      /* Finally, insert an HTML element with the same tag

1775                      name as the    token's. */
1776                      $this->insertElement($token);
1777                  break;
1778  
1779                  /* A start tag token whose tag name is "plaintext" */

1780                  case 'plaintext':
1781                      /* If the stack of open elements has a p  element in scope,

1782                      then act as if an end tag with the tag name p had been

1783                      seen. */
1784                      if($this->elementInScope('p')) {
1785                          $this->emitToken(array(
1786                              'name' => 'p',
1787                              'type' => HTML5::ENDTAG
1788                          ));
1789                      }
1790  
1791                      /* Insert an HTML element for the token. */

1792                      $this->insertElement($token);
1793  
1794                      return HTML5::PLAINTEXT;
1795                  break;
1796  
1797                  /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",

1798                  "h5", "h6" */
1799                  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1800                      /* If the stack of open elements has a p  element in scope,

1801                      then act as if an end tag with the tag name p had been seen. */
1802                      if($this->elementInScope('p')) {
1803                          $this->emitToken(array(
1804                              'name' => 'p',
1805                              'type' => HTML5::ENDTAG
1806                          ));
1807                      }
1808  
1809                      /* If the stack of open elements has in scope an element whose

1810                      tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then

1811                      this is a parse error; pop elements from the stack until an

1812                      element with one of those tag names has been popped from the

1813                      stack. */
1814                      while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1815                          array_pop($this->stack);
1816                      }
1817  
1818                      /* Insert an HTML element for the token. */

1819                      $this->insertElement($token);
1820                  break;
1821  
1822                  /* A start tag whose tag name is "a" */

1823                  case 'a':
1824                      /* If the list of active formatting elements contains

1825                      an element whose tag name is "a" between the end of the

1826                      list and the last marker on the list (or the start of

1827                      the list if there is no marker on the list), then this

1828                      is a parse error; act as if an end tag with the tag name

1829                      "a" had been seen, then remove that element from the list

1830                      of active formatting elements and the stack of open

1831                      elements if the end tag didn't already remove it (it

1832                      might not have if the element is not in table scope). */
1833                      $leng = count($this->a_formatting);
1834  
1835                      for($n = $leng - 1; $n >= 0; $n--) {
1836                          if($this->a_formatting[$n] === self::MARKER) {
1837                              break;
1838  
1839                          } elseif($this->a_formatting[$n]->nodeName === 'a') {
1840                              $this->emitToken(array(
1841                                  'name' => 'a',
1842                                  'type' => HTML5::ENDTAG
1843                              ));
1844                              break;
1845                          }
1846                      }
1847  
1848                      /* Reconstruct the active formatting elements, if any. */

1849                      $this->reconstructActiveFormattingElements();
1850  
1851                      /* Insert an HTML element for the token. */

1852                      $el = $this->insertElement($token);
1853  
1854                      /* Add that element to the list of active formatting

1855                      elements. */
1856                      $this->a_formatting[] = $el;
1857                  break;
1858  
1859                  /* A start tag whose tag name is one of: "b", "big", "em", "font",

1860                  "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1861                  case 'b': case 'big': case 'em': case 'font': case 'i':
1862                  case 'nobr': case 's': case 'small': case 'strike':
1863                  case 'strong': case 'tt': case 'u':
1864                      /* Reconstruct the active formatting elements, if any. */

1865                      $this->reconstructActiveFormattingElements();
1866  
1867                      /* Insert an HTML element for the token. */

1868                      $el = $this->insertElement($token);
1869  
1870                      /* Add that element to the list of active formatting

1871                      elements. */
1872                      $this->a_formatting[] = $el;
1873                  break;
1874  
1875                  /* A start tag token whose tag name is "button" */

1876                  case 'button':
1877                      /* If the stack of open elements has a button element in scope,

1878                      then this is a parse error; act as if an end tag with the tag

1879                      name "button" had been seen, then reprocess the token. (We don't

1880                      do that. Unnecessary.) */
1881                      if($this->elementInScope('button')) {
1882                          $this->inBody(array(
1883                              'name' => 'button',
1884                              'type' => HTML5::ENDTAG
1885                          ));
1886                      }
1887  
1888                      /* Reconstruct the active formatting elements, if any. */

1889                      $this->reconstructActiveFormattingElements();
1890  
1891                      /* Insert an HTML element for the token. */

1892                      $this->insertElement($token);
1893  
1894                      /* Insert a marker at the end of the list of active

1895                      formatting elements. */
1896                      $this->a_formatting[] = self::MARKER;
1897                  break;
1898  
1899                  /* A start tag token whose tag name is one of: "marquee", "object" */

1900                  case 'marquee': case 'object':
1901                      /* Reconstruct the active formatting elements, if any. */

1902                      $this->reconstructActiveFormattingElements();
1903  
1904                      /* Insert an HTML element for the token. */

1905                      $this->insertElement($token);
1906  
1907                      /* Insert a marker at the end of the list of active

1908                      formatting elements. */
1909                      $this->a_formatting[] = self::MARKER;
1910                  break;
1911  
1912                  /* A start tag token whose tag name is "xmp" */

1913                  case 'xmp':
1914                      /* Reconstruct the active formatting elements, if any. */

1915                      $this->reconstructActiveFormattingElements();
1916  
1917                      /* Insert an HTML element for the token. */

1918                      $this->insertElement($token);
1919  
1920                      /* Switch the content model flag to the CDATA state. */

1921                      return HTML5::CDATA;
1922                  break;
1923  
1924                  /* A start tag whose tag name is "table" */

1925                  case 'table':
1926                      /* If the stack of open elements has a p element in scope,

1927                      then act as if an end tag with the tag name p had been seen. */
1928                      if($this->elementInScope('p')) {
1929                          $this->emitToken(array(
1930                              'name' => 'p',
1931                              'type' => HTML5::ENDTAG
1932                          ));
1933                      }
1934  
1935                      /* Insert an HTML element for the token. */

1936                      $this->insertElement($token);
1937  
1938                      /* Change the insertion mode to "in table". */

1939                      $this->mode = self::IN_TABLE;
1940                  break;
1941  
1942                  /* A start tag whose tag name is one of: "area", "basefont",

1943                  "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1944                  case 'area': case 'basefont': case 'bgsound': case 'br':
1945                  case 'embed': case 'img': case 'param': case 'spacer':
1946                  case 'wbr':
1947                      /* Reconstruct the active formatting elements, if any. */

1948                      $this->reconstructActiveFormattingElements();
1949  
1950                      /* Insert an HTML element for the token. */

1951                      $this->insertElement($token);
1952  
1953                      /* Immediately pop the current node off the stack of open elements. */

1954                      array_pop($this->stack);
1955                  break;
1956  
1957                  /* A start tag whose tag name is "hr" */

1958                  case 'hr':
1959                      /* If the stack of open elements has a p element in scope,

1960                      then act as if an end tag with the tag name p had been seen. */
1961                      if($this->elementInScope('p')) {
1962                          $this->emitToken(array(
1963                              'name' => 'p',
1964                              'type' => HTML5::ENDTAG
1965                          ));
1966                      }
1967  
1968                      /* Insert an HTML element for the token. */

1969                      $this->insertElement($token);
1970  
1971                      /* Immediately pop the current node off the stack of open elements. */

1972                      array_pop($this->stack);
1973                  break;
1974  
1975                  /* A start tag whose tag name is "image" */

1976                  case 'image':
1977                      /* Parse error. Change the token's tag name to "img" and

1978                      reprocess it. (Don't ask.) */
1979                      $token['name'] = 'img';
1980                      return $this->inBody($token);
1981                  break;
1982  
1983                  /* A start tag whose tag name is "input" */

1984                  case 'input':
1985                      /* Reconstruct the active formatting elements, if any. */

1986                      $this->reconstructActiveFormattingElements();
1987  
1988                      /* Insert an input element for the token. */

1989                      $element = $this->insertElement($token, false);
1990  
1991                      /* If the form element pointer is not null, then associate the

1992                      input element with the form element pointed to by the form

1993                      element pointer. */
1994                      $this->form_pointer !== null
1995                          ? $this->form_pointer->appendChild($element)
1996                          : end($this->stack)->appendChild($element);
1997  
1998                      /* Pop that input element off the stack of open elements. */

1999                      array_pop($this->stack);
2000                  break;
2001  
2002                  /* A start tag whose tag name is "isindex" */

2003                  case 'isindex':
2004                      /* Parse error. */

2005                      // w/e

2006  
2007                      /* If the form element pointer is not null,

2008                      then ignore the token. */
2009                      if($this->form_pointer === null) {
2010                          /* Act as if a start tag token with the tag name "form" had

2011                          been seen. */
2012                          $this->inBody(array(
2013                              'name' => 'body',
2014                              'type' => HTML5::STARTTAG,
2015                              'attr' => array()
2016                          ));
2017  
2018                          /* Act as if a start tag token with the tag name "hr" had

2019                          been seen. */
2020                          $this->inBody(array(
2021                              'name' => 'hr',
2022                              'type' => HTML5::STARTTAG,
2023                              'attr' => array()
2024                          ));
2025  
2026                          /* Act as if a start tag token with the tag name "p" had

2027                          been seen. */
2028                          $this->inBody(array(
2029                              'name' => 'p',
2030                              'type' => HTML5::STARTTAG,
2031                              'attr' => array()
2032                          ));
2033  
2034                          /* Act as if a start tag token with the tag name "label"

2035                          had been seen. */
2036                          $this->inBody(array(
2037                              'name' => 'label',
2038                              'type' => HTML5::STARTTAG,
2039                              'attr' => array()
2040                          ));
2041  
2042                          /* Act as if a stream of character tokens had been seen. */

2043                          $this->insertText('This is a searchable index. '.
2044                          'Insert your search keywords here: ');
2045  
2046                          /* Act as if a start tag token with the tag name "input"

2047                          had been seen, with all the attributes from the "isindex"

2048                          token, except with the "name" attribute set to the value

2049                          "isindex" (ignoring any explicit "name" attribute). */
2050                          $attr = $token['attr'];
2051                          $attr[] = array('name' => 'name', 'value' => 'isindex');
2052  
2053                          $this->inBody(array(
2054                              'name' => 'input',
2055                              'type' => HTML5::STARTTAG,
2056                              'attr' => $attr
2057                          ));
2058  
2059                          /* Act as if a stream of character tokens had been seen

2060                          (see below for what they should say). */
2061                          $this->insertText('This is a searchable index. '.
2062                          'Insert your search keywords here: ');
2063  
2064                          /* Act as if an end tag token with the tag name "label"

2065                          had been seen. */
2066                          $this->inBody(array(
2067                              'name' => 'label',
2068                              'type' => HTML5::ENDTAG
2069                          ));
2070  
2071                          /* Act as if an end tag token with the tag name "p" had

2072                          been seen. */
2073                          $this->inBody(array(
2074                              'name' => 'p',
2075                              'type' => HTML5::ENDTAG
2076                          ));
2077  
2078                          /* Act as if a start tag token with the tag name "hr" had

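2079                          been seen. NOTE(review): the token built below is typed HTML5::ENDTAG, not HTML5::STARTTAG, which does not match this step; confirm. */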
2080                          $this->inBody(array(
2081                              'name' => 'hr',
2082                              'type' => HTML5::ENDTAG
2083                          ));
2084  
2085                          /* Act as if an end tag token with the tag name "form" had

2086                          been seen. */
2087                          $this->inBody(array(
2088                              'name' => 'form',
2089                              'type' => HTML5::ENDTAG
2090                          ));
2091                      }
2092                  break;
2093  
2094                  /* A start tag whose tag name is "textarea" */

2095                  case 'textarea':
2096                      $this->insertElement($token);
2097  
2098                      /* Switch the tokeniser's content model flag to the

2099                      RCDATA state. */
2100                      return HTML5::RCDATA;
2101                  break;
2102  
2103                  /* A start tag whose tag name is one of: "iframe", "noembed",

2104                  "noframes" */
2105                  case 'iframe': case 'noembed': case 'noframes':
2106                      $this->insertElement($token);
2107  
2108                      /* Switch the tokeniser's content model flag to the CDATA state. */

2109                      return HTML5::CDATA;
2110                  break;
2111  
2112                  /* A start tag whose tag name is "select" */

2113                  case 'select':
2114                      /* Reconstruct the active formatting elements, if any. */

2115                      $this->reconstructActiveFormattingElements();
2116  
2117                      /* Insert an HTML element for the token. */

2118                      $this->insertElement($token);
2119  
2120                      /* Change the insertion mode to "in select". */

2121                      $this->mode = self::IN_SELECT;
2122                  break;
2123  
2124                  /* A start or end tag whose tag name is one of: "caption", "col",

2125                  "colgroup", "frame", "frameset", "head", "option", "optgroup",

2126                  "tbody", "td", "tfoot", "th", "thead", "tr". */
2127                  case 'caption': case 'col': case 'colgroup': case 'frame':
2128                  case 'frameset': case 'head': case 'option': case 'optgroup':
2129                  case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2130                  case 'tr':
2131                      // Parse error. Ignore the token.

2132                  break;
2133  
2134                  /* A start or end tag whose tag name is one of: "event-source",

2135                  "section", "nav", "article", "aside", "header", "footer",

2136                  "datagrid", "command" */
2137                  case 'event-source': case 'section': case 'nav': case 'article':
2138                  case 'aside': case 'header': case 'footer': case 'datagrid':
2139                  case 'command':
2140                      // Work in progress!

2141                  break;
2142  
2143                  /* A start tag token not covered by the previous entries */

2144                  default:
2145                      /* Reconstruct the active formatting elements, if any. */

2146                      $this->reconstructActiveFormattingElements();
2147  
2148                      $this->insertElement($token);
2149                  break;
2150              }
2151              break;
2152  
2153              case HTML5::ENDTAG:
2154              switch($token['name']) {
2155                  /* An end tag with the tag name "body" */

2156                  case 'body':
2157                      /* If the second element in the stack of open elements is

2158                      not a body element, this is a parse error. Ignore the token.

2159                      (innerHTML case) */
2160                      if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2161                          // Ignore.

2162  
2163                      /* If the current node is not the body element, then this

2164                      is a parse error. */
2165                      } elseif(end($this->stack)->nodeName !== 'body') {
2166                          // Parse error.

2167                      }
2168  
2169                      /* Change the insertion mode to "after body". */

2170                      $this->mode = self::AFTER_BODY;
2171                  break;
2172  
2173                  /* An end tag with the tag name "html" */

2174                  case 'html':
2175                      /* Act as if an end tag with tag name "body" had been seen,

2176                      then, if that token wasn't ignored, reprocess the current

2177                      token. */
2178                      $this->inBody(array(
2179                          'name' => 'body',
2180                          'type' => HTML5::ENDTAG
2181                      ));
2182  
2183                      return $this->afterBody($token);
2184                  break;
2185  
2186                  /* An end tag whose tag name is one of: "address", "blockquote",

2187                  "center", "dir", "div", "dl", "fieldset", "listing", "menu",

2188                  "ol", "pre", "ul" */
2189                  case 'address': case 'blockquote': case 'center': case 'dir':
2190                  case 'div': case 'dl': case 'fieldset': case 'listing':
2191                  case 'menu': case 'ol': case 'pre': case 'ul':
2192                      /* If the stack of open elements has an element in scope

2193                      with the same tag name as that of the token, then generate

2194                      implied end tags. */
2195                      if($this->elementInScope($token['name'])) {
2196                          $this->generateImpliedEndTags();
2197  
2198                          /* Now, if the current node is not an element with

2199                          the same tag name as that of the token, then this

2200                          is a parse error. */
2201                          // This parse error is not checked or reported here.

2202  
2203                          /* If the stack of open elements has an element in

2204                          scope with the same tag name as that of the token,

2205                          then pop elements from this stack until an element

2206                          with that tag name has been popped from the stack. */
2207                          for($n = count($this->stack) - 1; $n >= 0; $n--) {
2208                              if($this->stack[$n]->nodeName === $token['name']) {
2209                                  $n = -1;
2210                              }
2211  
2212                              array_pop($this->stack);
2213                          }
2214                      }
2215                  break;
2216  
2217                  /* An end tag whose tag name is "form" */

2218                  case 'form':
2219                      /* If the stack of open elements has an element in scope

2220                      with the same tag name as that of the token, then generate

2221                      implied end tags. */
2222                      if($this->elementInScope($token['name'])) {
2223                          $this->generateImpliedEndTags();
2224  
2225                      } 
2226  
2227                      if(end($this->stack)->nodeName !== $token['name']) {
2228                          /* Now, if the current node is not an element with the

2229                          same tag name as that of the token, then this is a parse

2230                          error. */
2231                          // Parse error; no action is taken in this branch.

2232  
2233                      } else {
2234                          /* Otherwise, if the current node is an element with

2235                          the same tag name as that of the token pop that element

2236                          from the stack. */
2237                          array_pop($this->stack);
2238                      }
2239  
2240                      /* In any case, set the form element pointer to null. */

2241                      $this->form_pointer = null;
2242                  break;
2243  
2244                  /* An end tag whose tag name is "p" */

2245                  case 'p':
2246                      /* If the stack of open elements has a p element in scope,

2247                      then generate implied end tags, except for p elements. */
2248                      if($this->elementInScope('p')) {
2249                          $this->generateImpliedEndTags(array('p'));
2250  
2251                          /* If the current node is not a p element, then this is

2252                          a parse error. */
2253                          // This parse error is not checked or reported here.

2254  
2255                          /* If the stack of open elements has a p element in

2256                          scope, then pop elements from this stack until the stack

2257                          no longer has a p element in scope. */
2258                          for($n = count($this->stack) - 1; $n >= 0; $n--) {
2259                              if($this->elementInScope('p')) {
2260                                  array_pop($this->stack);
2261  
2262                              } else {
2263                                  break;
2264                              }
2265                          }
2266                      }
2267                  break;
2268  
2269                  /* An end tag whose tag name is "dd", "dt", or "li" */

2270                  case 'dd': case 'dt': case 'li':
2271                      /* If the stack of open elements has an element in scope

2272                      whose tag name matches the tag name of the token, then

2273                      generate implied end tags, except for elements with the

2274                      same tag name as the token. */
2275                      if($this->elementInScope($token['name'])) {
2276                          $this->generateImpliedEndTags(array($token['name']));
2277  
2278                          /* If the current node is not an element with the same

2279                          tag name as the token, then this is a parse error. */
2280                          // This parse error is not checked or reported here.

2281  
2282                          /* If the stack of open elements has an element in scope

2283                          whose tag name matches the tag name of the token, then

2284                          pop elements from this stack until an element with that

2285                          tag name has been popped from the stack. */
2286                          for($n = count($this->stack) - 1; $n >= 0; $n--) {
2287                              if($this->stack[$n]->nodeName === $token['name']) {
2288                                  $n = -1;
2289                              }
2290  
2291                              array_pop($this->stack);
2292                          }
2293                      }
2294                  break;
2295  
2296                  /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",

2297                  "h5", "h6" */
2298                  case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2299                      $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2300  
2301                      /* If the stack of open elements has in scope an element whose

2302                      tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then

2303                      generate implied end tags. */
2304                      if($this->elementInScope($elements)) {
2305                          $this->generateImpliedEndTags();
2306  
2307                          /* Now, if the current node is not an element with the same

2308                          tag name as that of the token, then this is a parse error. */
2309                          // This parse error is not checked or reported here.

2310  
2311                          /* If the stack of open elements has in scope an element

2312                          whose tag name is one of "h1", "h2", "h3", "h4", "h5", or

2313                          "h6", then pop elements from the stack until an element

2314                          with one of those tag names has been popped from the stack. */
2315                          while($this->elementInScope($elements)) {
2316                              array_pop($this->stack);
2317                          }
2318                      }
2319                  break;
2320  
2321                  /* An end tag whose tag name is one of: "a", "b", "big", "em",

2322                  "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2323                  case 'a': case 'b': case 'big': case 'em': case 'font':
2324                  case 'i': case 'nobr': case 's': case 'small': case 'strike':
2325                  case 'strong': case 'tt': case 'u':
2326                      /* 1. Let the formatting element be the last element in

2327                      the list of active formatting elements that:

2328                          * is between the end of the list and the last scope

2329                          marker in the list, if any, or the start of the list

2330