| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

/** Zend_Search_Lucene_Index_DictionaryLoader */
require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php';


/** Zend_Search_Lucene_Exception */
require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';


/**
 * Read/update access to a single index segment: field infos, term dictionary,
 * term frequencies/positions, norms and the list of deleted documents.
 *
 * Segment files are looked up first as stand-alone files named
 * "<segment name><extension>" and, failing that, inside the compound
 * "<segment name>.cfs" file.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentInfo
{
    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * Term Dictionary Index
     *
     * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
     * of performance considerations)
     * [0] -> $termFieldNum
     * [1] -> $termValue
     *
     * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Term Dictionary Index TermInfos
     *
     * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
     * of performance considerations)
     * [0] -> $docFreq
     * [1] -> $freqPointer
     * [2] -> $proxPointer
     * [3] -> $skipOffset
     * [4] -> $indexPointer
     *
     * @var array
     */
    private $_termDictionaryInfos;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    private $_fields;

    /**
     * Field positions in a dictionary.
     * (Term dictionary contains fields ordered by names)
     *
     * @var array
     */
    private $_fieldsDicPositions;


    /**
     * Associative array where the key is the file name and the value is data offset
     * in a compound segment file (.cfs).
     *
     * @var array
     */
    private $_segFiles;

    /**
     * Associative array where the key is the file name and the value is file size
     * within the compound segment file (.cfs).
     *
     * @var array
     */
    private $_segFileSizes;


    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory_Filesystem
     */
    private $_directory;

    /**
     * Normalization factors.
     * An array fieldNum => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    private $_norms = array();

    /**
     * List of deleted documents.
     * bitset if bitset extension is loaded or array (docId => 1) otherwise.
     * null means the segment has no deletions.
     *
     * @var mixed
     */
    private $_deleted;

    /**
     * $this->_deleted update flag
     *
     * @var boolean
     */
    private $_deletedDirty = false;


    /**
     * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
     * Documents count and Directory as a parameter.
     *
     * Reads the compound file table (if a .cfs file exists), the field infos
     * (.fnm) and the deletions list (.del, optional).
     *
     * @param string $name
     * @param integer $docCount
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @throws Zend_Search_Exception  rethrown when opening the .del file fails for
     *                                any reason other than the file being absent
     */
    public function __construct($name, $docCount, $directory)
    {
        $this->_name           = $name;
        $this->_docCount       = $docCount;
        $this->_directory      = $directory;
        $this->_termDictionary = null;

        $this->_segFiles     = array();
        $this->_segFileSizes = array();
        if ($this->_directory->fileExists($name . '.cfs')) {
            $cfsFile       = $this->_directory->getFileObject($name . '.cfs');
            $segFilesCount = $cfsFile->readVInt();

            for ($count = 0; $count < $segFilesCount; $count++) {
                $dataOffset = $cfsFile->readLong();
                if ($count != 0) {
                    // Size of the previous entry = distance between the two data offsets
                    $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
                }
                $fileName = $cfsFile->readString();
                $this->_segFiles[$fileName] = $dataOffset;
            }
            if ($count != 0) {
                // The last entry extends up to the end of the compound file
                $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
            }
        }

        $fnmFile       = $this->openCompoundFile('.fnm');
        $fieldsCount   = $fnmFile->readVInt();
        $fieldNames    = array();
        $fieldNums     = array();
        $this->_fields = array();
        for ($count = 0; $count < $fieldsCount; $count++) {
            $fieldName = $fnmFile->readString();
            $fieldBits = $fnmFile->readByte();
            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $fieldBits & 1,
                                                                            $count,
                                                                            $fieldBits & 2 );
            if ($fieldBits & 0x10) {
                // norms are omitted for the indexed field
                $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
            }

            $fieldNums[$count]  = $count;
            $fieldNames[$count] = $fieldName;
        }
        // Order field numbers by field name to get their positions in the term dictionary
        array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
        $this->_fieldsDicPositions = array_flip($fieldNums);

        try {
            $delFile = $this->openCompoundFile('.del');

            $byteCount = $delFile->readInt();
            $byteCount = ceil($byteCount/8);
            $bitCount  = $delFile->readInt();

            if ($bitCount == 0) {
                $delBytes = '';
            } else {
                $delBytes = $delFile->readBytes($byteCount);
            }

            if (extension_loaded('bitset')) {
                $this->_deleted = $delBytes;
            } else {
                $this->_deleted = array();
                // Iterate over the bytes actually read: $delBytes is empty when
                // $bitCount is 0, so looping up to $byteCount would read
                // uninitialized string offsets.
                $delBytesLength = strlen($delBytes);
                for ($count = 0; $count < $delBytesLength; $count++) {
                    $byte = ord($delBytes[$count]);
                    for ($bit = 0; $bit < 8; $bit++) {
                        if ($byte & (1<<$bit)) {
                            $this->_deleted[$count*8 + $bit] = 1;
                        }
                    }
                }
            }
        } catch(Zend_Search_Exception $e) {
            // A missing .del file simply means "no deletions"; the check relies on
            // the message produced by openCompoundFile().
            if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
                $this->_deleted = null;
            } else {
                throw $e;
            }
        }
    }

    /**
     * Opens index file stored within compound index file
     *
     * A stand-alone file "<segment name><extension>" takes precedence; otherwise
     * the .cfs file object is returned, positioned at the start of the entry.
     *
     * @param string $extension
     * @param boolean $shareHandler
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Storage_File
     */
    public function openCompoundFile($extension, $shareHandler = true)
    {
        $filename = $this->_name . $extension;

        // Try to open common file first
        if ($this->_directory->fileExists($filename)) {
            return $this->_directory->getFileObject($filename, $shareHandler);
        }

        if( !isset($this->_segFiles[$filename]) ) {
            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                       . $filename . ' file.' );
        }

        $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
        $file->seek($this->_segFiles[$filename]);
        return $file;
    }

    /**
     * Get compound file length
     *
     * @param string $extension
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function compoundFileLength($extension)
    {
        $filename = $this->_name . $extension;

        // Try to get common file first
        if ($this->_directory->fileExists($filename)) {
            return $this->_directory->fileLength($filename);
        }

        if( !isset($this->_segFileSizes[$filename]) ) {
            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                       . $filename . ' file.' );
        }

        return $this->_segFileSizes[$filename];
    }

    /**
     * Returns field index or -1 if field is not found
     *
     * @param string $fieldName
     * @return integer
     */
    public function getFieldNum($fieldName)
    {
        foreach( $this->_fields as $field ) {
            if( $field->name == $fieldName ) {
                return $field->number;
            }
        }

        return -1;
    }

    /**
     * Returns field info for specified field
     *
     * @param integer $fieldNum
     * @return Zend_Search_Lucene_Index_FieldInfo
     */
    public function getField($fieldNum)
    {
        return $this->_fields[$fieldNum];
    }

    /**
     * Returns array of field names (name => name).
     * if $indexed parameter is true, then returns only indexed fields.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFields($indexed = false)
    {
        $result = array();
        foreach( $this->_fields as $field ) {
            if( (!$indexed) || $field->isIndexed ) {
                $result[ $field->name ] = $field->name;
            }
        }
        return $result;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Returns the total number of documents in this segment (including deleted documents).
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Returns number of deleted documents.
     *
     * @return integer
     */
    private function _deletedCount()
    {
        if ($this->_deleted === null) {
            return 0;
        }

        if (extension_loaded('bitset')) {
            return count(bitset_to_array($this->_deleted));
        } else {
            return count($this->_deleted);
        }
    }

    /**
     * Returns the total number of non-deleted documents in this segment.
     *
     * @return integer
     */
    public function numDocs()
    {
        if ($this->hasDeletions()) {
            return $this->_docCount - $this->_deletedCount();
        } else {
            return $this->_docCount;
        }
    }

    /**
     * Get field position in a fields dictionary
     *
     * @param integer $fieldNum
     * @return integer
     */
    private function _getFieldPosition($fieldNum) {
        // Treat values which are not in a translation table as a 'direct value'
        return isset($this->_fieldsDicPositions[$fieldNum]) ?
                           $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }


    /**
     * TermInfo cache (term key => TermInfo or null), least recently used first.
     *
     * Size is 1024.
     * Numbers are used instead of class constants because of performance considerations
     *
     * @var array
     */
    private $_termInfoCache = array();

    /**
     * Drops the oldest cache entries until only 768 remain.
     */
    private function _cleanUpTermInfoCache()
    {
        // Entries are removed from the front of the array, i.e. least recently used first
        foreach ($this->_termInfoCache as $key => $termInfo) {
            unset($this->_termInfoCache[$key]);

            // leave 768 last used term infos
            if (count($this->_termInfoCache) <= 768) {
                break;
            }
        }
    }

    /**
     * Stores a lookup result in the TermInfo cache and trims the cache when it
     * reaches its size limit.
     *
     * The ">=" test (rather than "==") guarantees trimming happens whichever
     * code path performed the insertion.
     *
     * @param string $termKey
     * @param Zend_Search_Lucene_Index_TermInfo|null $termInfo
     */
    private function _cacheTermInfo($termKey, $termInfo)
    {
        $this->_termInfoCache[$termKey] = $termInfo;

        if (count($this->_termInfoCache) >= 1024) {
            $this->_cleanUpTermInfoCache();
        }
    }

    /**
     * Loads the term dictionary index into $_termDictionary/$_termDictionaryInfos.
     *
     * Uses the serialized copy (.sti) when present; otherwise parses the .tii
     * file and writes the .sti copy for subsequent loads.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    private function _loadDictionaryIndex()
    {
        // Check, if index is already serialized
        if ($this->_directory->fileExists($this->_name . '.sti')) {
            // Prefetch dictionary index data
            $stiFile     = $this->_directory->getFileObject($this->_name . '.sti');
            $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));

            // Load dictionary index data
            // NOTE(review): unserialize() trusts the .sti file content; the index
            // directory must not be writable by untrusted parties.
            list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
        } else {
            // Prefetch dictionary index data
            $tiiFile     = $this->openCompoundFile('.tii');
            $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

            // Load dictionary index data
            list($this->_termDictionary, $this->_termDictionaryInfos) =
                        Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

            $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
            $stiFile     = $this->_directory->createFile($this->_name . '.sti');
            $stiFile->writeBytes($stiFileData);
        }
    }

    /**
     * Decodes document frequencies from a .frq stream at its current position.
     *
     * Each entry starts with a VInt document delta: an odd value means the term
     * occurs once in the document, an even value is followed by a VInt frequency.
     *
     * @param Zend_Search_Lucene_Storage_File $frqFile  file positioned at the term's data
     * @param integer $docFreq  number of entries to read
     * @param integer $shift    value added to every document id used as a result key
     * @return array  array(docId => freq, ...)
     */
    private function _readDocFreqs($frqFile, $docFreq, $shift = 0)
    {
        $freqs = array();
        $docId = 0;

        for ($count = 0; $count < $docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                $docId += ($docDelta-1)/2;
                $freqs[$shift + $docId] = 1;
            } else {
                $docId += $docDelta/2;
                $freqs[$shift + $docId] = $frqFile->readVInt();
            }
        }

        return $freqs;
    }

    /**
     * Scans terms dictionary and returns term info
     *
     * The in-memory dictionary index is binary-searched first; on a miss the
     * .tis file is scanned sequentially starting from the closest preceding
     * index entry.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Index_TermInfo|null  null if the term is not in the segment
     */
    public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
    {
        $termKey = $term->key();
        if (isset($this->_termInfoCache[$termKey])) {
            $termInfo = $this->_termInfoCache[$termKey];

            // Move termInfo to the end of cache
            unset($this->_termInfoCache[$termKey]);
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }

        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($term->field);

        if ($searchField == -1) {
            return null;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // search for appropriate value in dictionary
        $lowIndex  = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            $mid     = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta    = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // We got it!
                $a        = $this->_termDictionaryInfos[$mid];
                $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

                // Put loaded termInfo into cache (and keep the cache bounded)
                $this->_cacheTermInfo($termKey, $termInfo);

                return $termInfo;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile   = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();

        // The 20 header bytes (int + long + int + int) have just been consumed,
        // so the remaining distance to the indexed entry is relative to here.
        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm[1]     /* text */;
        $termFieldNum = $prevTerm[0]     /* field */;
        $freqPointer  = $prevTermInfo[1] /* freqPointer */;
        $proxPointer  = $prevTermInfo[2] /* proxPointer */;
        $docFreq      = 0;
        $skipOffset   = 0;
        for ($count = $prevPosition*$indexInterval + 1;
             $count <= $termCount &&
             ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
              ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        // strcmp() instead of "==": loose comparison treats numeric strings such
        // as '1.0' and '1.00' as equal and would return another term's info.
        if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            $termInfo = null;
        }

        // Put loaded termInfo into cache (and keep the cache bounded)
        $this->_cacheTermInfo($termKey, $termInfo);

        return $termInfo;
    }

    /**
     * Returns term freqs array.
     * Result array structure: array(docId => freq, ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $shift  value added to every document id in the result
     * @throws Zend_Search_Lucene_Exception
     * @return array  empty array if the term is not in the segment
     */
    public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

        return $this->_readDocFreqs($frqFile, $termInfo->docFreq, $shift);
    }

    /**
     * Returns term positions array.
     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $shift  value added to every document id in the result
     * @throws Zend_Search_Lucene_Exception
     * @return array  empty array if the term is not in the segment
     */
    public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return array();
        }

        // Frequencies are read completely before the .prx file is opened
        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
        $freqs = $this->_readDocFreqs($frqFile, $termInfo->docFreq);

        $result  = array();
        $prxFile = $this->openCompoundFile('.prx');
        $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions    = array();

            // Positions are stored as deltas from the previous position
            for ($count = 0; $count < $freq; $count++ ) {
                $termPosition += $prxFile->readVInt();
                $positions[]   = $termPosition;
            }

            $result[$shift + $docId] = $positions;
        }

        return $result;
    }

    /**
     * Load normalization factors from an index file
     *
     * @param integer $fieldNum
     * @throws Zend_Search_Lucene_Exception
     */
    private function _loadNorm($fieldNum)
    {
        $fFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
    }

    /**
     * Returns normalization factor for specified documents
     *
     * @param integer $id
     * @param string $fieldName
     * @return float|null  null if the field is unknown or is not indexed
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        // Guard against unknown fields explicitly instead of indexing $_fields[-1]
        if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
            return null;
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
    }

    /**
     * Returns norm vector, encoded in a byte string
     *
     * For an unknown or non-indexed field a vector filled with the default
     * similarity's norm for a zero-length field is returned.
     *
     * @param string $fieldName
     * @return string
     */
    public function normVector($fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
            $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

            return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                              $this->_docCount);
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }


    /**
     * Returns true if any documents have been deleted from this index segment.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        return $this->_deleted !== null;
    }


    /**
     * Deletes a document from the index segment.
     * $id is an internal document id
     *
     * The change is kept in memory until writeChanges() is called.
     *
     * @param integer $id
     */
    public function delete($id)
    {
        $this->_deletedDirty = true;

        if (extension_loaded('bitset')) {
            if ($this->_deleted === null) {
                $this->_deleted = bitset_empty($id);
            }
            bitset_incl($this->_deleted, $id);
        } else {
            if ($this->_deleted === null) {
                $this->_deleted = array();
            }

            $this->_deleted[$id] = 1;
        }
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer $id
     * @return boolean
     */
    public function isDeleted($id)
    {
        if ($this->_deleted === null) {
            return false;
        }

        if (extension_loaded('bitset')) {
            return bitset_in($this->_deleted, $id);
        } else {
            return isset($this->_deleted[$id]);
        }
    }


    /**
     * Write changes if it's necessary.
     *
     * Writes the deletions list to "<segment name>.del" when documents were
     * deleted since the last write.
     */
    public function writeChanges()
    {
        if (!$this->_deletedDirty) {
            return;
        }

        if (extension_loaded('bitset')) {
            $delBytes = $this->_deleted;
            $bitCount = count(bitset_to_array($delBytes));
        } else {
            // Pack the docId => 1 map into a bit string, lowest bit first
            $byteCount = floor($this->_docCount/8)+1;
            $delBytes  = str_repeat(chr(0), $byteCount);
            for ($count = 0; $count < $byteCount; $count++) {
                $byte = 0;
                for ($bit = 0; $bit < 8; $bit++) {
                    if (isset($this->_deleted[$count*8 + $bit])) {
                        $byte |= (1<<$bit);
                    }
                }
                $delBytes[$count] = chr($byte);
            }
            $bitCount = count($this->_deleted);
        }


        $delFile = $this->_directory->createFile($this->_name . '.del');
        $delFile->writeInt($this->_docCount);
        $delFile->writeInt($bitCount);
        $delFile->writeBytes($delBytes);

        $this->_deletedDirty = false;
    }



    /**
     * Term Dictionary File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Frequencies File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Offset of the .frq file in the compound file
     *
     * @var integer
     */
    private $_frqFileOffset;

    /**
     * Positions File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Offset of the .prx file in the compound file
     *
     * @var integer
     */
    private $_prxFileOffset;


    /**
     * Number of terms left in the term stream
     *
     * @var integer
     */
    private $_termCount = 0;

    /**
     * Segment skip interval
     *
     * @var integer
     */
    private $_skipInterval;

    /**
     * Last TermInfo in a terms stream
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_lastTermInfo = null;

    /**
     * Last Term in a terms stream
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lastTerm = null;

    /**
     * Map of the document IDs (segment docId => new docId)
     * Used to get new docID after removing deleted documents.
     * It's not very effective from memory usage point of view,
     * but much faster than other methods
     *
     * @var array|null
     */
    private $_docMap = null;

    /**
     * An array of all term positions in the documents.
     * Array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @var array
     */
    private $_lastTermPositions;

    /**
     * Reset terms stream
     *
     * $startId - id for the first document
     * $compact - remove deleted documents
     *
     * Positions the stream on the first term (see nextTerm()).
     * Returns start document id for the next segment
     *
     * @param integer $startId
     * @param boolean $compact
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function reset($startId = 0, $compact = false)
    {
        // Drop the reference to any previous file object before opening a new one
        $this->_tisFile = null;
        $this->_tisFile = $this->openCompoundFile('.tis', false);
        $tiVersion = $this->_tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $this->_termCount    = $this->_tisFile->readLong();
                               $this->_tisFile->readInt();  // Read Index interval
        $this->_skipInterval = $this->_tisFile->readInt();  // Read skip interval

        $this->_frqFile = null;
        $this->_frqFile = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        $this->_prxFile = null;
        $this->_prxFile = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        // Start values for the prefix/delta decoding done in nextTerm()
        $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

        $this->_docMap = array();
        for ($count = 0; $count < $this->_docCount; $count++) {
            if (!$this->isDeleted($count)) {
                // When compacting, ids are renumbered consecutively over non-deleted documents
                $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
            }
        }

        $this->nextTerm();
        return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
    }


    /**
     * Scans terms dictionary and returns next term
     *
     * Also loads the positions of the returned term for all non-deleted
     * documents (see currentTermPositions()).
     *
     * @return Zend_Search_Lucene_Index_Term|null  null when the stream is exhausted
     */
    public function nextTerm()
    {
        if ($this->_tisFile === null  ||  $this->_termCount == 0) {
            $this->_lastTerm     = null;
            $this->_lastTermInfo = null;

            // may be necessary for "empty" segment
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            return null;
        }

        $termPrefixLength = $this->_tisFile->readVInt();
        $termSuffix       = $this->_tisFile->readString();
        $termFieldNum     = $this->_tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;

        $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);

        // Pointers are stored as deltas from the previous term's pointers
        $docFreq     = $this->_tisFile->readVInt();
        $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
        $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
        if ($docFreq >= $this->_skipInterval) {
            $skipOffset = $this->_tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }

        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);


        $this->_lastTermPositions = array();

        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $freqs = $this->_readDocFreqs($this->_frqFile, $this->_lastTermInfo->docFreq);

        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions    = array();

            // Positions must be read even for deleted documents to stay in sync with the stream
            for ($count = 0; $count < $freq; $count++ ) {
                $termPosition += $this->_prxFile->readVInt();
                $positions[]   = $termPosition;
            }

            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
            }
        }


        $this->_termCount--;
        if ($this->_termCount == 0) {
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;
        }

        return $this->_lastTerm;
    }


    /**
     * Returns term in current position
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @return array
     */
    public function currentTermPositions()
    {
        return $this->_lastTermPositions;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |