| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */


/** Zend_Search_Lucene_Document */
require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Document.php';


/**
 * HTML document.
 *
 * Parses an HTML string or file into a DOM tree, registers the title, the
 * named meta tags and the body text as document fields, collects the links
 * found in the document, and can highlight words inside the parsed body.
 *
 * Instances are created through the static factories loadHTML() and
 * loadHTMLFile(); the constructor is private.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links (href values of all <a> elements, de-duplicated)
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links (href values of /html/head/link, de-duplicated)
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Object constructor
     *
     * Parses the HTML and fills the 'title' field, one field per named meta
     * tag, the 'body' field and the link lists.
     *
     * @param string  $data         HTML source, or a file name when $isFile is true
     * @param boolean $isFile       whether $data is a path to load rather than markup
     * @param boolean $storeContent true to add the body as a Text field,
     *                              false to add it as an UnStored field
     */
    private function __construct($data, $isFile, $storeContent)
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        // Parser warnings are deliberately silenced: loading is best-effort
        // and whatever DOM results is what gets indexed.
        if ($isFile) {
            @$this->_doc->loadHTMLFile($data);
        } else {
            @$this->_doc->loadHTML($data);
        }

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        $titleNodes = $xpath->query('/html/head/title');
        foreach ($titleNodes as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $this->_doc->actualEncoding));

        $metaNodes = $xpath->query('/html/head/meta[@name]');
        foreach ($metaNodes as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           $this->_doc->actualEncoding));
        }

        $docBody = '';
        $bodyNodes = $xpath->query('/html/body');
        foreach ($bodyNodes as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, $this->_doc->actualEncoding));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, $this->_doc->actualEncoding));
        }

        $linkNodes = $this->_doc->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
            // getAttribute() yields an empty string when the attribute is absent
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        $linkNodes = $xpath->query('/html/head/link');
        foreach ($linkNodes as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Get node text
     *
     * Recursively appends the content of every text node below $node to
     * $text, each followed by a single space. <script> elements are skipped
     * so that script source does not end up in the collected text. Nodes
     * that are neither text nor element nodes contribute nothing.
     *
     * @param DOMNode $node
     * @param string &$text  accumulator the text is appended to
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            $text .= $node->nodeValue;
            $text .= ' ';
        } else if ($node->nodeType === XML_ELEMENT_NODE  &&  $node->nodeName !== 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTMLFile($file, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent);
    }


    /**
     * Highlight text in text node
     *
     * Tokenizes the node text with the default analyzer and wraps every token
     * whose term text is a key of $wordsToHighlight in a styled <b> element.
     *
     * @param DOMText $node
     * @param array   $wordsToHighlight  map keyed by term text (values are not used here)
     * @param string  $color             CSS colour used as the highlight background
     */
    public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
    {
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        // NOTE(review): the constructor passes $this->_doc->actualEncoding for the
        // indexed fields while this call uses $this->_doc->encoding - confirm
        // which one the analyzer should receive.
        $analyzer->setInput($node->nodeValue, $this->_doc->encoding);

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) === 0) {
            return;
        }

        // Work from the last match back to the first: splitting only ever cuts
        // text at or after the current token, so the offsets of the tokens
        // still to be processed remain valid.
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // The text goes in through a text node instead of createElement()'s
            // value argument, which treats '&' as the start of an entity reference.
            $highlightedNode = $this->_doc->createElement('b');
            $highlightedNode->appendChild($this->_doc->createTextNode($matchedWordNode->nodeValue));
            $highlightedNode->setAttribute('style', 'color:black;background-color:' . $color);

            $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
        }
    }


    /**
     * highlight words in content of the specified node
     *
     * Recurses into every non-text child except <script> and highlights the
     * direct text-node children of $contextNode.
     *
     * @param DOMNode $contextNode
     * @param array   $wordsToHighlight  map keyed by term text
     * @param string  $color
     */
    public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
    {
        if (!$contextNode->hasChildNodes()) {
            return;
        }

        $textNodes = array();

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType === XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else if ($childNode->nodeName !== 'script') {
                // Skip script nodes, descend into everything else
                $this->_highlightNode($childNode, $wordsToHighlight, $color);
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $color);
        }
    }



    /**
     * Highlight text with specified color
     *
     * The given words are tokenized with the default analyzer and every
     * occurrence of a resulting term inside the document body is wrapped in
     * a highlighted <b> element. The stored DOM is modified in place, so a
     * later getHTML() call returns the highlighted markup as well.
     *
     * @param string|array $words  word string or list of word strings
     * @param string $color        CSS colour used as the highlight background
     * @return string              HTML of the (possibly highlighted) document
     */
    public function highlight($words, $color = '#66ffff')
    {
        if (!is_array($words)) {
            $words = array($words);
        }

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

        $wordsToHighlight = array();
        foreach ($words as $wordString) {
            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
        }

        if (count($wordsToHighlight) === 0) {
            return $this->_doc->saveHTML();
        }

        // Re-key by term text so membership can be tested with isset()
        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        $xpath = new DOMXPath($this->_doc);

        // Start from <body> itself: _highlightNode() handles the children of the
        // node it is given, so starting from body's child elements would leave
        // text placed directly inside <body> unhighlighted.
        $matchedNodes = $xpath->query('/html/body');
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
        }

        // Return the result on this path too, as the documented return type promises
        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML
     *
     * @return string  serialized form of the stored DOM document
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |