[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/search/Zend/Search/Lucene/Document/ -> Html.php (source)

   1  <?php
   2  /**
   3   * Zend Framework
   4   *
   5   * LICENSE
   6   *
   7   * This source file is subject to the new BSD license that is bundled
   8   * with this package in the file LICENSE.txt.
   9   * It is also available through the world-wide-web at this URL:
  10   * http://framework.zend.com/license/new-bsd
  11   * If you did not receive a copy of the license and are unable to
  12   * obtain it through the world-wide-web, please send an email
  13   * to license@zend.com so we can send you a copy immediately.
  14   *
  15   * @category   Zend
  16   * @package    Zend_Search_Lucene
  17   * @subpackage Document
  18   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20   */
  21  
  22  
  23  /** Zend_Search_Lucene_Document */
  24  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';
  25  
  26  
  27  /**
  28   * HTML document.
  29   *
  30   * @category   Zend
  31   * @package    Zend_Search_Lucene
  32   * @subpackage Document
  33   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  34   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  35   */
  36  class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
  37  {
  38      /**
  39       * List of document links
  40       *
  41       * @var array
  42       */
  43      private $_links = array();
  44  
  45      /**
  46       * List of document header links
  47       *
  48       * @var array
  49       */
  50      private $_headerLinks = array();
  51  
  52      /**
  53       * Stored DOM representation
  54       *
  55       * @var DOMDocument
  56       */
  57      private $_doc;
  58  
  59      /**
  60       * Object constructor
  61       *
  62       * @param string  $data
  63       * @param boolean $isFile
  64       * @param boolean $storeContent
  65       */
  66      private function __construct($data, $isFile, $storeContent)
  67      {
  68          $this->_doc = new DOMDocument();
  69          $this->_doc->substituteEntities = true;
  70  
  71          if ($isFile) {
  72              @$this->_doc->loadHTMLFile($data);
  73          } else{
  74              @$this->_doc->loadHTML($data);
  75          }
  76  
  77          $xpath = new DOMXPath($this->_doc);
  78  
  79          $docTitle = '';
  80          $titleNodes = $xpath->query('/html/head/title');
  81          foreach ($titleNodes as $titleNode) {
  82              // title should always have only one entry, but we process all nodeset entries
  83              $docTitle .= $titleNode->nodeValue . ' ';
  84          }
  85          $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $this->_doc->actualEncoding));
  86  
  87          $metaNodes = $xpath->query('/html/head/meta[@name]');
  88          foreach ($metaNodes as $metaNode) {
  89              $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
  90                                                             $metaNode->getAttribute('content'),
  91                                                             $this->_doc->actualEncoding));
  92          }
  93  
  94          $docBody = '';
  95          $bodyNodes = $xpath->query('/html/body');
  96          foreach ($bodyNodes as $bodyNode) {
  97              // body should always have only one entry, but we process all nodeset entries
  98              $this->_retrieveNodeText($bodyNode, $docBody);
  99          }
 100          if ($storeContent) {
 101              $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, $this->_doc->actualEncoding));
 102          } else {
 103              $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, $this->_doc->actualEncoding));
 104          }
 105  
 106          $linkNodes = $this->_doc->getElementsByTagName('a');
 107          foreach ($linkNodes as $linkNode) {
 108              if (($href = $linkNode->getAttribute('href')) != '') {
 109                  $this->_links[] = $href;
 110              }
 111          }
 112          $this->_links = array_unique($this->_links);
 113  
 114          $linkNodes = $xpath->query('/html/head/link');
 115          foreach ($linkNodes as $linkNode) {
 116              if (($href = $linkNode->getAttribute('href')) != '') {
 117                  $this->_headerLinks[] = $href;
 118              }
 119          }
 120          $this->_headerLinks = array_unique($this->_headerLinks);
 121      }
 122  
 123      /**
 124       * Get node text
 125       *
 126       * We should exclude scripts, which may be not included into comment tags, CDATA sections,
 127       *
 128       * @param DOMNode $node
 129       * @param string &$text
 130       */
 131      private function _retrieveNodeText(DOMNode $node, &$text)
 132      {
 133          if ($node->nodeType == XML_TEXT_NODE) {
 134              $text .= $node->nodeValue ;
 135              $text .= ' ';
 136          } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
 137              foreach ($node->childNodes as $childNode) {
 138                  $this->_retrieveNodeText($childNode, $text);
 139              }
 140          }
 141      }
 142  
 143      /**
 144       * Get document HREF links
 145       *
 146       * @return array
 147       */
 148      public function getLinks()
 149      {
 150          return $this->_links;
 151      }
 152  
 153      /**
 154       * Get document header links
 155       *
 156       * @return array
 157       */
 158      public function getHeaderLinks()
 159      {
 160          return $this->_headerLinks;
 161      }
 162  
 163      /**
 164       * Load HTML document from a string
 165       *
 166       * @param string $data
 167       * @param boolean $storeContent
 168       * @return Zend_Search_Lucene_Document_Html
 169       */
 170      public static function loadHTML($data, $storeContent = false)
 171      {
 172          return new Zend_Search_Lucene_Document_Html($data, false, $storeContent);
 173      }
 174  
 175      /**
 176       * Load HTML document from a file
 177       *
 178       * @param string $file
 179       * @param boolean $storeContent
 180       * @return Zend_Search_Lucene_Document_Html
 181       */
 182      public static function loadHTMLFile($file, $storeContent = false)
 183      {
 184          return new Zend_Search_Lucene_Document_Html($file, true, $storeContent);
 185      }
 186  
 187  
 188      /**
 189       * Highlight text in text node
 190       *
 191       * @param DOMText $node
 192       * @param array   $wordsToHighlight
 193       * @param string  $color
 194       */
 195      public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
 196      {
 197          $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
 198          $analyzer->setInput($node->nodeValue, $this->_doc->encoding);
 199  
 200          $matchedTokens = array();
 201  
 202          while (($token = $analyzer->nextToken()) !== null) {
 203              if (isset($wordsToHighlight[$token->getTermText()])) {
 204                  $matchedTokens[] = $token;
 205              }
 206          }
 207  
 208          if (count($matchedTokens) == 0) {
 209              return;
 210          }
 211  
 212          $matchedTokens = array_reverse($matchedTokens);
 213  
 214          foreach ($matchedTokens as $token) {
 215              // Cut text after matched token
 216              $node->splitText($token->getEndOffset());
 217  
 218              // Cut matched node
 219              $matchedWordNode = $node->splitText($token->getStartOffset());
 220  
 221              $highlightedNode = $this->_doc->createElement('b', $matchedWordNode->nodeValue);
 222              $highlightedNode->setAttribute('style', 'color:black;background-color:' . $color);
 223  
 224              $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
 225          }
 226      }
 227  
 228  
 229      /**
 230       * highlight words in content of the specified node
 231       *
 232       * @param DOMNode $contextNode
 233       * @param array $wordsToHighlight
 234       * @param string $color
 235       */
 236      public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
 237      {
 238          $textNodes = array();
 239  
 240          if (!$contextNode->hasChildNodes()) {
 241              return;
 242          }
 243  
 244          foreach ($contextNode->childNodes as $childNode) {
 245              if ($childNode->nodeType == XML_TEXT_NODE) {
 246                  // process node later to leave childNodes structure untouched
 247                  $textNodes[] = $childNode;
 248              } else {
 249                  // Skip script nodes
 250                  if ($childNode->nodeName != 'script') {
 251                      $this->_highlightNode($childNode, $wordsToHighlight, $color);
 252                  }
 253              }
 254          }
 255  
 256          foreach ($textNodes as $textNode) {
 257              $this->_highlightTextNode($textNode, $wordsToHighlight, $color);
 258          }
 259      }
 260  
 261  
 262  
 263      /**
 264       * Highlight text with specified color
 265       *
 266       * @param string|array $words
 267       * @param string $color
 268       * @return string
 269       */
 270      public function highlight($words, $color = '#66ffff')
 271      {
 272          if (!is_array($words)) {
 273              $words = array($words);
 274          }
 275          $wordsToHighlight = array();
 276  
 277          $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
 278          foreach ($words as $wordString) {
 279              $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
 280          }
 281  
 282          if (count($wordsToHighlight) == 0) {
 283              return $this->_doc->saveHTML();
 284          }
 285  
 286          $wordsToHighlightFlipped = array();
 287          foreach ($wordsToHighlight as $id => $token) {
 288              $wordsToHighlightFlipped[$token->getTermText()] = $id;
 289          }
 290  
 291          $xpath = new DOMXPath($this->_doc);
 292  
 293          $matchedNodes = $xpath->query("/html/body/*");
 294          foreach ($matchedNodes as $matchedNode) {
 295              $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
 296          }
 297  
 298      }
 299  
 300      /**
 301       * Get HTML
 302       *
 303       * @return string
 304       */
 305      public function getHTML()
 306      {
 307          return $this->_doc->saveHTML();
 308      }
 309  }
 310  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7