| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */


/** Zend_Search_Lucene_Exception */
require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';

/** Zend_Search_Lucene_Index_SegmentInfo */
require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';


/**
 * Base class for writers that build the files of a single index segment:
 * stored fields (.fdx/.fdt), field infos and norms (.fnm/.f<N>), the term
 * dictionary and its index (.tis/.tii), frequencies (.frq), positions (.prx)
 * and, optionally, the compound file (.cfs) that bundles all of them.
 *
 * Concrete subclasses implement close().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * In this class the value decides how often addTerm() copies a term into
     * the '.tii' index file: every $indexInterval-th term is indexed.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more accelerable cases.
     * More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that skip data is not used, and it is the value
     * this class ships with. (The original note named 16 as the default;
     * presumably that refers to the reference Lucene implementation.)
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index files (file names).
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
     * keyed by field name.
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();


    /**
     * '.fdx' file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt' file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
    }


    /**
     * Add field to the segment
     *
     * A field seen for the first time is registered under the next free
     * field number. For a field that is already known, the isIndexed and
     * storeTermVector flags are OR-ed into the existing entry.
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        if (!isset($this->_fields[$field->name])) {
            // Field numbers are assigned sequentially in registration order
            $fieldNumber = count($this->_fields);
            $this->_fields[$field->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                                       $field->isIndexed,
                                                                       $fieldNumber,
                                                                       $field->storeTermVector);

            return $fieldNumber;
        } else {
            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;

            return $this->_fields[$field->name]->number;
        }
    }

    /**
     * Add fieldInfo to the segment
     *
     * Same registration rules as addField(), but the source is an existing
     * FieldInfo object. Its own number is ignored; a number local to this
     * segment is assigned instead.
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        if (!isset($this->_fields[$fieldInfo->name])) {
            // Field numbers are assigned sequentially in registration order
            $fieldNumber = count($this->_fields);
            $this->_fields[$fieldInfo->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
                                                                       $fieldInfo->isIndexed,
                                                                       $fieldNumber,
                                                                       $fieldInfo->storeTermVector);

            return $fieldNumber;
        } else {
            $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
            $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;

            return $this->_fields[$fieldInfo->name]->number;
        }
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * Appends one document to the stored-fields files and increments the
     * segment's document counter. The '.fdx' and '.fdt' files are created
     * on the first call.
     *
     * Every field passed in is looked up in the segment field list by name,
     * so it has to be registered (addField()/addFieldInfo()) beforehand.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        // The field index stores where this document's data starts in the field data file
        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($storedFields));
        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary ?    0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            $this->_fdtFile->writeByte($fieldBits);
            if ($field->isBinary) {
                // Binary values are written as a byte count followed by the raw bytes
                $this->_fdtFile->writeVInt(strlen($field->value));
                $this->_fdtFile->writeBytes($field->value);
            } else {
                $this->_fdtFile->writeString($field->getUtf8Value());
            }
        }

        $this->_docCount++;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Dump Field Info (.fnm) segment file
     *
     * For every indexed field a separate norm file ('.f' followed by the
     * field number) is written from $this->_norms as well. All created files
     * are appended to the segment file list; the '.fnm' file is listed last.
     */
    protected function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00)
// not supported yet            0x04 /* term positions are stored with the term vectors */ |
// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
                               );

            if ($field->isIndexed) {
                $normFileName = $this->_name . '.f' . $field->number;
                $fFile = $this->_directory->createFile($normFileName);
                $fFile->writeBytes($this->_norms[$field->name]);
                $this->_files[] = $normFileName;
            }
        }

        $this->_files[] = $this->_name . '.fnm';
    }



    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     *
     * The term counts in both headers are written as placeholders here and
     * are filled in by closeDictionaryFiles(). The '.tii' file additionally
     * receives one special leading entry (empty term), which is why
     * closeDictionaryFiles() counts one index entry more than addTerm() wrote.
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeInt((int)0xFFFFFFFE);
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
        $this->_tisFile->writeInt(self::$indexInterval);
        $this->_tisFile->writeInt(self::$skipInterval);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeInt((int)0xFFFFFFFE);
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
        $this->_tiiFile->writeInt(self::$indexInterval);
        $this->_tiiFile->writeInt(self::$skipInterval);

        /** Dump dictionary header */
        $this->_tiiFile->writeVInt(0);                    // prefix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        $this->_tiiFile->writeVInt(20);                   // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        // Matches the IndexDelta of the special entry written above
        $this->_lastIndexPosition = 20;
        $this->_termCount         = 0;

    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * Writes the term's postings to the frequency and positions files, then
     * its dictionary entry to the '.tis' file. Every $indexInterval-th term
     * is also written to the '.tii' index file together with the delta of
     * the '.tis' file position.
     *
     * initializeDictionaryFiles() has to be called before the first term is
     * added, and $termEntry->field has to name a registered field.
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            // Doc id delta is shifted left by one bit; the low bit is set
            // when the frequency is not written separately
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;
            if (count($termPositions) > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt(count($termPositions));
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            // Positions are delta-encoded within each document
            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($termDocs) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // The dictionary stores the field number, not the field name
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;

        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the placeholder term counts written by
     * initializeDictionaryFiles() with the real values. Both counts live at
     * offset 4 of their file (presumably right after the 4-byte format
     * marker).
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        // addTerm() writes one index entry for every $indexInterval-th term;
        // + 1 counts the special empty entry written by initializeDictionaryFiles().
        // Integer arithmetic only: the count has to match the number of
        // entries actually present in the file.
        $indexTermCount = ($this->_termCount - $this->_termCount % self::$indexInterval)
                        / self::$indexInterval + 1;
        $this->_tiiFile->writeLong($indexTermCount);
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is prefix-compressed against the previous term of the
     * same field. $prevTerm and $prevTermInfo are passed by reference and
     * are replaced with $term and $termInfo, so the next call is encoded
     * relative to this entry.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Length of the common prefix in bytes
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern:
            // the prefix length is written in characters, so only whole
            // characters inside the matched byte range are counted
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                // Sequence length is derived from the lead byte (1 to 4 bytes)
                $charBytes = 1;
                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
                        $charBytes++;
                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write prefix length
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // First entry: pointers are written as absolute values
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - it's not 0 when $termInfo->docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Layout: file count, then one (data offset, file name) record per file,
     * then the contents of the files in the same order. Offsets are written
     * as placeholders first and patched once the position of each file's
     * data is known. Every source file is deleted from the directory after
     * it has been copied.
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            // Remember where the offset placeholder of this file is stored
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            // Copy in chunks to keep memory usage bounded
            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
                $byteCount -= strlen($data);
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |