[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/htmlpurifier/HTMLPurifier/ -> Lexer.php (source)

   1  <?php
   2  
   3  require_once 'HTMLPurifier/Token.php';
   4  require_once  'HTMLPurifier/Encoder.php';
   5  require_once 'HTMLPurifier/EntityParser.php';
   6  
   7  // implementations
   8  require_once 'HTMLPurifier/Lexer/DirectLex.php';
   9  if (version_compare(PHP_VERSION, "5", ">=")) {
  10      // You can remove the if statement if you are running PHP 5 only.
  11      // We ought to get the strict version to follow those rules.
  12      require_once 'HTMLPurifier/Lexer/DOMLex.php';
  13  }
  14  
// -- CONFIGURATION DIRECTIVES --------------------------------------------
// The calls below register the directives this file reads. For define(),
// the arguments as used here are: namespace, directive name, default value,
// type string, and an (HTML) description.

// Core.ConvertDocumentToFragment: bool, defaults to true. Read by
// HTMLPurifier_Lexer::normalize() to decide whether extractBody() is run.
HTMLPurifier_ConfigSchema::define(
    'Core', 'ConvertDocumentToFragment', true, 'bool', '
This parameter determines whether or not the filter should convert
input that is a full document with html and body tags to a fragment
of just the contents of a body tag. This parameter is simply something
HTML Purifier can do during an edge-case: for most inputs, this
processing is not necessary.
');
// Registers an alias between Core.AcceptFullDocuments and
// Core.ConvertDocumentToFragment (presumably the former is the older name
// kept for backwards compatibility -- confirm against defineAlias()).
HTMLPurifier_ConfigSchema::defineAlias('Core', 'AcceptFullDocuments', 'Core', 'ConvertDocumentToFragment');

// Core.LexerImpl: null (auto-detect), a string identifier, or a lexer
// object. Read by HTMLPurifier_Lexer::create().
HTMLPurifier_ConfigSchema::define(
    'Core', 'LexerImpl', null, 'mixed/null', '
<p>
  This parameter determines what lexer implementation can be used. The
  valid values are:
</p>
<dl>
  <dt><em>null</em></dt>
  <dd>
    Recommended, the lexer implementation will be auto-detected based on
    your PHP-version and configuration.
  </dd>
  <dt><em>string</em> lexer identifier</dt>
  <dd>
    This is a slim way of manually overridding the implementation.
    Currently recognized values are: DOMLex (the default PHP5 implementation)
    and DirectLex (the default PHP4 implementation). Only use this if
    you know what you are doing: usually, the auto-detection will
    manage things for cases you aren\'t even aware of.
  </dd>
  <dt><em>object</em> lexer instance</dt>
  <dd>
    Super-advanced: you can specify your own, custom, implementation that
    implements the interface defined by <code>HTMLPurifier_Lexer</code>.
    I may remove this option simply because I don\'t expect anyone
    to use it.
  </dd>
</dl>
<p>
  This directive has been available since 2.0.0.
</p>
'
);

// Core.MaintainLineNumbers: true, false, or null (decide automatically).
// HTMLPurifier_Lexer::create() forces DirectLex when this is true, or when
// it is null and Core.CollectErrors is enabled.
HTMLPurifier_ConfigSchema::define(
    'Core', 'MaintainLineNumbers', null, 'bool/null', '
<p>
  If true, HTML Purifier will add line number information to all tokens.
  This is useful when error reporting is turned on, but can result in
  significant performance degradation and should not be used when
  unnecessary. This directive must be used with the DirectLex lexer,
  as the DOMLex lexer does not (yet) support this functionality. 
  If the value is null, an appropriate value will be selected based
  on other configuration. This directive has been available since 2.0.0.
</p>
');

// Core.AggressivelyFixLt: bool, defaults to false. Not read anywhere in
// this file; presumably consumed by a concrete lexer implementation.
HTMLPurifier_ConfigSchema::define(
    'Core', 'AggressivelyFixLt', false, 'bool', '
This directive enables aggressive pre-filter fixes HTML Purifier can
perform in order to ensure that open angled-brackets do not get killed
during parsing stage. Enabling this will result in two preg_replace_callback
calls and one preg_replace call for every bit of HTML passed through here.
It is not necessary and will have no effect for PHP 4.
This directive has been available since 2.1.0.
');
  81  
  82  /**
  83   * Forgivingly lexes HTML (SGML-style) markup into tokens.
  84   * 
  85   * A lexer parses a string of SGML-style markup and converts them into
  86   * corresponding tokens.  It doesn't check for well-formedness, although its
  87   * internal mechanism may make this automatic (such as the case of
  88   * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
  89   * from.
  90   * 
  91   * A lexer is HTML-oriented: it might work with XML, but it's not
  92   * recommended, as we adhere to a subset of the specification for optimization
  93   * reasons.
  94   * 
  95   * This class should not be directly instantiated, but you may use create() to
  96   * retrieve a default copy of the lexer.  Being a supertype, this class
  97   * does not actually define any implementation, but offers commonly used
  98   * convenience functions for subclasses.
  99   * 
 100   * @note The unit tests will instantiate this class for testing purposes, as
 101   *       many of the utility functions require a class to be instantiated.
 102   *       Be careful when porting this class to PHP 5.
 103   * 
 104   * @par
 105   * 
 106   * @note
 107   * We use tokens rather than create a DOM representation because DOM would:
 108   * 
 109   * @par
 110   *  -# Require more processing power to create,
 111   *  -# Require recursion to iterate,
 112   *  -# Must be compatible with PHP 5's DOM (otherwise duplication),
 113   *  -# Has the entire document structure (html and body not needed), and
 114   *  -# Has unknown readability improvement.
 115   * 
 116   * @par
 117   * What the last item means is that the functions for manipulating tokens are
 118   * already fairly compact, and when well-commented, more abstraction may not
 119   * be needed.
 120   * 
 121   * @see HTMLPurifier_Token
 122   */
 123  class HTMLPurifier_Lexer
 124  {
 125      
 126      // -- STATIC ----------------------------------------------------------
 127      
 128      /**
 129       * Retrieves or sets the default Lexer as a Prototype Factory.
 130       * 
 131       * Depending on what PHP version you are running, the abstract base
 132       * Lexer class will determine which concrete Lexer is best for you:
 133       * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex
 134       * for PHP 5 and beyond.  This general rule has a few exceptions to it
 135       * involving special features that only DirectLex implements.
 136       * 
 137       * @static
 138       * 
 139       * @note The behavior of this class has changed, rather than accepting
 140       *       a prototype object, it now accepts a configuration object.
 141       *       To specify your own prototype, set %Core.LexerImpl to it.
 142       *       This change in behavior de-singletonizes the lexer object.
 143       * 
 144       * @note In PHP4, it is possible to call this factory method from 
 145       *       subclasses, such usage is not recommended and not
 146       *       forwards-compatible.
 147       * 
 148       * @param $prototype Optional prototype lexer or configuration object
 149       * @return Concrete lexer.
 150       */
 151      function create($config) {
 152          
 153          if (!is_a($config, 'HTMLPurifier_Config')) {
 154              $lexer = $config;
 155              trigger_error("Passing a prototype to 
 156                HTMLPurifier_Lexer::create() is deprecated, please instead
 157                use %Core.LexerImpl", E_USER_WARNING);
 158          } else {
 159              $lexer = $config->get('Core', 'LexerImpl');
 160          }
 161          
 162          if (is_object($lexer)) {
 163              return $lexer;
 164          }
 165          
 166          if (is_null($lexer)) { do {
 167              // auto-detection algorithm
 168              
 169              // once PHP DOM implements native line numbers, or we
 170              // hack out something using XSLT, remove this stipulation
 171              $line_numbers = $config->get('Core', 'MaintainLineNumbers');
 172              if (
 173                  $line_numbers === true ||
 174                  ($line_numbers === null && $config->get('Core', 'CollectErrors'))
 175              ) {
 176                  $lexer = 'DirectLex';
 177                  break;
 178              }
 179              
 180              if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
 181                  class_exists('DOMDocument')) { // check for DOM support
 182                  $lexer = 'DOMLex';
 183              } else {
 184                  $lexer = 'DirectLex';
 185              }
 186              
 187          } while(0); } // do..while so we can break
 188          
 189          // instantiate recognized string names
 190          switch ($lexer) {
 191              case 'DOMLex':
 192                  return new HTMLPurifier_Lexer_DOMLex();
 193              case 'DirectLex':
 194                  return new HTMLPurifier_Lexer_DirectLex();
 195              case 'PH5P':
 196                  // experimental Lexer that must be manually included
 197                  return new HTMLPurifier_Lexer_PH5P();
 198              default:
 199                  trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
 200          }
 201          
 202      }
 203      
 204      // -- CONVENIENCE MEMBERS ---------------------------------------------
 205      
 206      function HTMLPurifier_Lexer() {
 207          $this->_entity_parser = new HTMLPurifier_EntityParser();
 208      }
 209      
 210      /**
 211       * Most common entity to raw value conversion table for special entities.
 212       * @protected
 213       */
 214      var $_special_entity2str =
 215              array(
 216                      '&quot;' => '"',
 217                      '&amp;'  => '&',
 218                      '&lt;'   => '<',
 219                      '&gt;'   => '>',
 220                      '&#39;'  => "'",
 221                      '&#039;' => "'",
 222                      '&#x27;' => "'"
 223              );
 224      
 225      /**
 226       * Parses special entities into the proper characters.
 227       * 
 228       * This string will translate escaped versions of the special characters
 229       * into the correct ones.
 230       * 
 231       * @warning
 232       * You should be able to treat the output of this function as
 233       * completely parsed, but that's only because all other entities should
 234       * have been handled previously in substituteNonSpecialEntities()
 235       * 
 236       * @param $string String character data to be parsed.
 237       * @returns Parsed character data.
 238       */
 239      function parseData($string) {
 240          
 241          // following functions require at least one character
 242          if ($string === '') return '';
 243          
 244          // subtracts amps that cannot possibly be escaped
 245          $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
 246              ($string[strlen($string)-1] === '&' ? 1 : 0);
 247          
 248          if (!$num_amp) return $string; // abort if no entities
 249          $num_esc_amp = substr_count($string, '&amp;');
 250          $string = strtr($string, $this->_special_entity2str);
 251          
 252          // code duplication for sake of optimization, see above
 253          $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
 254              ($string[strlen($string)-1] === '&' ? 1 : 0);
 255          
 256          if ($num_amp_2 <= $num_esc_amp) return $string;
 257          
 258          // hmm... now we have some uncommon entities. Use the callback.
 259          $string = $this->_entity_parser->substituteSpecialEntities($string);
 260          return $string;
 261      }
 262      
 263      /**
 264       * Lexes an HTML string into tokens.
 265       * 
 266       * @param $string String HTML.
 267       * @return HTMLPurifier_Token array representation of HTML.
 268       */
 269      function tokenizeHTML($string, $config, &$context) {
 270          trigger_error('Call to abstract class', E_USER_ERROR);
 271      }
 272      
 273      /**
 274       * Translates CDATA sections into regular sections (through escaping).
 275       * 
 276       * @static
 277       * @protected
 278       * @param $string HTML string to process.
 279       * @returns HTML with CDATA sections escaped.
 280       */
 281      function escapeCDATA($string) {
 282          return preg_replace_callback(
 283              '/<!\[CDATA\[(.+?)\]\]>/s',
 284              array('HTMLPurifier_Lexer', 'CDATACallback'),
 285              $string
 286          );
 287      }
 288      
 289      /**
 290       * Special CDATA case that is especiall convoluted for <script>
 291       */
 292      function escapeCommentedCDATA($string) {
 293          return preg_replace_callback(
 294              '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
 295              array('HTMLPurifier_Lexer', 'CDATACallback'),
 296              $string
 297          );
 298      }
 299      
 300      /**
 301       * Callback function for escapeCDATA() that does the work.
 302       * 
 303       * @static
 304       * @warning Though this is public in order to let the callback happen,
 305       *          calling it directly is not recommended.
 306       * @params $matches PCRE matches array, with index 0 the entire match
 307       *                  and 1 the inside of the CDATA section.
 308       * @returns Escaped internals of the CDATA section.
 309       */
 310      function CDATACallback($matches) {
 311          // not exactly sure why the character set is needed, but whatever
 312          return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
 313      }
 314      
 315      /**
 316       * Takes a piece of HTML and normalizes it by converting entities, fixing
 317       * encoding, extracting bits, and other good stuff.
 318       */
 319      function normalize($html, $config, &$context) {
 320          
 321          // extract body from document if applicable
 322          if ($config->get('Core', 'ConvertDocumentToFragment')) {
 323              $html = $this->extractBody($html);
 324          }
 325          
 326          // normalize newlines to \n
 327          $html = str_replace("\r\n", "\n", $html);
 328          $html = str_replace("\r", "\n", $html);
 329          
 330          if ($config->get('HTML', 'Trusted')) {
 331              // escape convoluted CDATA
 332              $html = $this->escapeCommentedCDATA($html);
 333          }
 334          
 335          // escape CDATA
 336          $html = $this->escapeCDATA($html);
 337          
 338          // expand entities that aren't the big five
 339          $html = $this->_entity_parser->substituteNonSpecialEntities($html);
 340          
 341          // clean into wellformed UTF-8 string for an SGML context: this has
 342          // to be done after entity expansion because the entities sometimes
 343          // represent non-SGML characters (horror, horror!)
 344          $html = HTMLPurifier_Encoder::cleanUTF8($html);
 345          
 346          return $html;
 347      }
 348      
 349      /**
 350       * Takes a string of HTML (fragment or document) and returns the content
 351       */
 352      function extractBody($html) {
 353          $matches = array();
 354          $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
 355          if ($result) {
 356              return $matches[1];
 357          } else {
 358              return $html;
 359          }
 360      }
 361      
 362  }
 363  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7