| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */


/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';

/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';


/**
 * Fuzzy query: matches index terms that are similar to a base term.
 *
 * The query is never searched directly. rewrite() walks the index terms,
 * scores each one against the base term and returns an empty, single-term
 * or boolean query built from the terms that are similar enough. All other
 * search entry points (optimize(), createWeight(), execute(), matchedDocs(),
 * score()) throw an exception.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Fuzzy extends Zend_Search_Lucene_Search_Query
{
    /** Default minimum similarity */
    const DEFAULT_MIN_SIMILARITY = 0.5;

    /**
     * Maximum number of matched terms.
     * Apache Lucene defines this limitation as boolean query maximum number of clauses:
     * org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
     */
    const MAX_CLAUSE_COUNT = 1024;

    /**
     * Array of precalculated max distances
     *
     * keys are integers representing a word size (in bytes, as measured by strlen())
     *
     * @var array
     */
    private $_maxDistances = array();

    /**
     * Base searching term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * A value between 0 and 1 to set the required similarity
     * between the query term and the matching terms. For example, for a
     * _minimumSimilarity of 0.5 a term of the same length
     * as the query term is considered similar to the query term if the edit distance
     * between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * The length of common (non-fuzzy) prefix
     *
     * @var integer
     */
    private $_prefixLength;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects; null until rewrite() has run.
     *
     * @var array
     */
    private $_matches = null;

    /**
     * Matched terms scores
     *
     * @var array
     */
    private $_scores = null;

    /**
     * Array of the term keys.
     * Used to sort terms in alphabetical order if terms have the same scores
     *
     * @var array
     */
    private $_termKeys = null;

    /**
     * Zend_Search_Lucene_Search_Query_Fuzzy constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $term              base term to match against
     * @param float                         $minimumSimilarity required similarity, 0 <= value < 1
     * @param integer                       $prefixLength      length of the common (non-fuzzy) prefix, >= 0
     * @throws Zend_Search_Lucene_Exception if an argument is out of range
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = 0)
    {
        if ($minimumSimilarity < 0) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be less than 0');
        }
        if ($minimumSimilarity >= 1) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be greater than or equal to 1');
        }
        if ($prefixLength < 0) {
            throw new Zend_Search_Lucene_Exception('prefixLength cannot be less than 0');
        }

        $this->_term              = $term;
        $this->_minimumSimilarity = $minimumSimilarity;
        $this->_prefixLength      = $prefixLength;
    }

    /**
     * Calculate maximum distance for specified word length
     *
     * The result is also stored in $_maxDistances under the key $length.
     *
     * @param integer $prefixLength length of the common prefix
     * @param integer $termLength   length of the query term part that is compared
     * @param integer $length       length of the candidate term part that is compared
     * @return integer
     */
    private function _calculateMaxDistance($prefixLength, $termLength, $length)
    {
        $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength));
        return $this->_maxDistances[$length];
    }

    /**
     * Return the maximum distance for a candidate length, using the cached value when present
     *
     * @param integer $prefixLength
     * @param integer $termLength
     * @param integer $length
     * @return integer
     */
    private function _getMaxDistance($prefixLength, $termLength, $length)
    {
        if (isset($this->_maxDistances[$length])) {
            return $this->_maxDistances[$length];
        }

        return $this->_calculateMaxDistance($prefixLength, $termLength, $length);
    }

    /**
     * Convert the edit distance between two strings into a similarity value
     *
     * @param string  $termRest   compared part of the query term
     * @param string  $target     compared part of the index term
     * @param integer $normLength length the edit distance is normalised by
     * @return float|integer
     */
    private function _editDistanceSimilarity($termRest, $target, $normLength)
    {
        if ($normLength == 0) {
            // Nothing to normalise by. Dividing would raise a division by zero,
            // so identical strings are a full match and anything else is no match.
            return ($termRest === $target)? 1 : 0;
        }

        $distance = levenshtein($termRest, $target);
        if ($distance < 0) {
            // levenshtein() reports -1 when it cannot compute a distance (before PHP 8.0
            // this happens for arguments longer than 255 characters). Without this check
            // the formula below would yield a similarity above 1, i.e. a false match.
            return 0;
        }

        return 1 - $distance/$normLength;
    }

    /**
     * Register a term as matched if its similarity exceeds the minimum similarity
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param float   $similarity
     * @param float   $scaleFactor factor applied to (similarity - minimum similarity) to get the score
     */
    private function _addMatchIfSimilar($term, $similarity, $scaleFactor)
    {
        if ($similarity > $this->_minimumSimilarity) {
            $this->_matches[]  = $term;
            $this->_termKeys[] = $term->key();
            $this->_scores[]   = ($similarity - $this->_minimumSimilarity)*$scaleFactor;
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Fills the matched terms list as a side effect (see getQueryTerms()).
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches  = array();
        $this->_scores   = array();
        $this->_termKeys = array();

        if ($this->_term->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_term->field);
        }

        $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
        $prefixByteLength = strlen($prefix);
        $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);

        // substr() may return false instead of '' on older PHP versions, hence the cast
        $termRest         = (string) substr($this->_term->text, $prefixByteLength);
        // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
        $termRestLength   = strlen($termRest);

        $scaleFactor = 1/(1 - $this->_minimumSimilarity);

        foreach ($fields as $field) {
            $index->resetTermsStream();

            if ($prefix != '') {
                $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

                // strncmp() compares bytes; a loose "==" on the extracted prefix would treat
                // numeric-looking strings such as "1e2" and "100" as equal
                while ($index->currentTerm() !== null          &&
                       $index->currentTerm()->field == $field  &&
                       strncmp($index->currentTerm()->text, $prefix, $prefixByteLength) === 0) {
                    $currentTerm  = $index->currentTerm();

                    // Calculate similarity
                    $target       = (string) substr($currentTerm->text, $prefixByteLength);
                    $targetLength = strlen($target);

                    $maxDistance  = $this->_getMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

                    if ($termRestLength == 0) {
                        // we don't have anything to compare. That means if we just add
                        // the letters for current term we get the new word
                        $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $targetLength/$prefixUtf8Length);
                    } else if ($targetLength == 0) {
                        $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length);
                    } else if ($maxDistance < abs($termRestLength - $targetLength)) {
                        // just adding the characters of term to target or vice-versa results in too many edits
                        // for example "pre" length is 3 and "prefixes" length is 8. We can see that
                        // given this optimal circumstance, the edit distance cannot be less than 5.
                        // which is 8-3 or more precisely abs(3-8).
                        // if our maximum edit distance is 4, then we can discard this word
                        // without looking at it.
                        $similarity = 0;
                    } else {
                        $similarity = $this->_editDistanceSimilarity(
                            $termRest,
                            $target,
                            $prefixUtf8Length + min($termRestLength, $targetLength)
                        );
                    }

                    $this->_addMatchIfSimilar($currentTerm, $similarity, $scaleFactor);

                    $index->nextTerm();
                }
            } else {
                $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

                while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                    $currentTerm  = $index->currentTerm();

                    // Calculate similarity
                    $target       = (string) $currentTerm->text;
                    $targetLength = strlen($target);

                    $maxDistance  = $this->_getMaxDistance(0, $termRestLength, $targetLength);

                    if ($maxDistance < abs($termRestLength - $targetLength)) {
                        // the length difference alone already needs more edits than allowed
                        // (see the explanation in the prefix branch), so discard this word
                        // without looking at it.
                        $similarity = 0;
                    } else {
                        $similarity = $this->_editDistanceSimilarity(
                            $termRest,
                            $target,
                            min($termRestLength, $targetLength)
                        );
                    }

                    $this->_addMatchIfSimilar($currentTerm, $similarity, $scaleFactor);

                    $index->nextTerm();
                }
            }

            $index->closeTermsStream();
        }

        $matchCount = count($this->_matches);

        if ($matchCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if ($matchCount == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        }

        $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();

        // Best scores first; terms with equal scores are ordered by their keys
        array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                        $this->_termKeys, SORT_ASC,  SORT_STRING,
                        $this->_matches);

        $termCount = 0;
        foreach ($this->_matches as $id => $matchedTerm) {
            $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
            $subquery->setBoost($this->_scores[$id]);

            $rewrittenQuery->addSubquery($subquery);

            $termCount++;
            if ($termCount >= self::MAX_CLAUSE_COUNT) {
                break;
            }
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Always throws: the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception if rewrite() has not been performed yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Always throws: the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }


    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Always throws: the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * Always throws: the query has to be rewritten first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Always throws: the query has to be rewritten first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * Highlights the text of every matched term. If rewrite() has not been
     * performed yet there are no matched terms and an empty word list is passed on.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        // $_matches is null until rewrite() has run; iterating null would raise a warning
        if ($this->_matches !== null) {
            foreach ($this->_matches as $term) {
                $words[] = $term->text;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_term->field === null)? '' : $this->_term->field . ':')
             . $this->_term->text . '~'
             . (($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)? round($this->_minimumSimilarity, 4) : '');
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |