[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/search/Zend/Search/Lucene/Index/ -> SegmentInfo.php (source)

   1  <?php
   2  /**
   3   * Zend Framework
   4   *
   5   * LICENSE
   6   *
   7   * This source file is subject to the new BSD license that is bundled
   8   * with this package in the file LICENSE.txt.
   9   * It is also available through the world-wide-web at this URL:
  10   * http://framework.zend.com/license/new-bsd
  11   * If you did not receive a copy of the license and are unable to
  12   * obtain it through the world-wide-web, please send an email
  13   * to license@zend.com so we can send you a copy immediately.
  14   *
  15   * @category   Zend
  16   * @package    Zend_Search_Lucene
  17   * @subpackage Index
  18   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20   */
  21  
  22  /** Zend_Search_Lucene_Index_DictionaryLoader */
  23  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php';
  24  
  25  
  26  /** Zend_Search_Lucene_Exception */
  27  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
  28  
  29  
  30  /**
  31   * @category   Zend
  32   * @package    Zend_Search_Lucene
  33   * @subpackage Index
  34   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  35   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  36   */
  37  class Zend_Search_Lucene_Index_SegmentInfo
  38  {
  39      /**
  40       * Number of docs in a segment
  41       *
  42       * @var integer
  43       */
  44      private $_docCount;
  45  
  46      /**
  47       * Segment name
  48       *
  49       * @var string
  50       */
  51      private $_name;
  52  
  53      /**
  54       * Term Dictionary Index
  55       *
  56       * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
  57       * of performance considerations)
  58       * [0] -> $termFieldNum
  59       * [1] -> $termValue
  60       *
  61       * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
  62       *
  63       * @var array
  64       */
  65      private $_termDictionary;
  66  
  67      /**
  68       * Term Dictionary Index TermInfos
  69       *
  70       * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
  71       * of performance considerations)
  72       * [0] -> $docFreq
  73       * [1] -> $freqPointer
  74       * [2] -> $proxPointer
  75       * [3] -> $skipOffset
  76       * [4] -> $indexPointer
  77       *
  78       * @var array
  79       */
  80      private $_termDictionaryInfos;
  81  
  82      /**
  83       * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  84       *
  85       * @var array
  86       */
  87      private $_fields;
  88  
  89      /**
  90       * Field positions in a dictionary.
  91       * (Term dictionary contains fields ordered by names)
  92       *
  93       * @var array
  94       */
  95      private $_fieldsDicPositions;
  96  
  97  
  98      /**
  99       * Associative array where the key is the file name and the value is data offset
 100       * in a compound segment file (.cfs).
 101       *
 102       * @var array
 103       */
 104      private $_segFiles;
 105  
 106      /**
 107       * Associative array where the key is the file name and the value is file size (.cfs).
 108       *
 109       * @var array
 110       */
 111      private $_segFileSizes;
 112  
 113  
 114      /**
 115       * File system adapter.
 116       *
 117       * @var Zend_Search_Lucene_Storage_Directory_Filesystem
 118       */
 119      private $_directory;
 120  
 121      /**
 122       * Normalization factors.
 123       * An array fieldName => normVector
 124       * normVector is a binary string.
 125       * Each byte corresponds to an indexed document in a segment and
 126       * encodes normalization factor (float value, encoded by
 127       * Zend_Search_Lucene_Search_Similarity::encodeNorm())
 128       *
 129       * @var array
 130       */
 131      private $_norms = array();
 132  
 133      /**
 134       * List of deleted documents.
 135       * bitset if bitset extension is loaded or array otherwise.
 136       *
 137       * @var mixed
 138       */
 139      private $_deleted;
 140  
 141      /**
 142       * $this->_deleted update flag
 143       *
 144       * @var boolean
 145       */
 146      private $_deletedDirty = false;
 147  
 148  
 149      /**
 150       * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
 151       * Documents count and Directory as a parameter.
 152       *
 153       * @param string $name
 154       * @param integer $docCount
 155       * @param Zend_Search_Lucene_Storage_Directory $directory
 156       */
 157      public function __construct($name, $docCount, $directory)
 158      {
 159          $this->_name = $name;
 160          $this->_docCount = $docCount;
 161          $this->_directory = $directory;
 162          $this->_termDictionary = null;
 163  
 164          $this->_segFiles = array();
 165          if ($this->_directory->fileExists($name . '.cfs')) {
 166              $cfsFile = $this->_directory->getFileObject($name . '.cfs');
 167              $segFilesCount = $cfsFile->readVInt();
 168  
 169              for ($count = 0; $count < $segFilesCount; $count++) {
 170                  $dataOffset = $cfsFile->readLong();
 171                  if ($count != 0) {
 172                      $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
 173                  }
 174                  $fileName = $cfsFile->readString();
 175                  $this->_segFiles[$fileName] = $dataOffset;
 176              }
 177              if ($count != 0) {
 178                  $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
 179              }
 180          }
 181  
 182          $fnmFile = $this->openCompoundFile('.fnm');
 183          $fieldsCount = $fnmFile->readVInt();
 184          $fieldNames = array();
 185          $fieldNums  = array();
 186          $this->_fields = array();
 187          for ($count=0; $count < $fieldsCount; $count++) {
 188              $fieldName = $fnmFile->readString();
 189              $fieldBits = $fnmFile->readByte();
 190              $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
 191                                                                              $fieldBits & 1,
 192                                                                              $count,
 193                                                                              $fieldBits & 2 );
 194              if ($fieldBits & 0x10) {
 195                  // norms are omitted for the indexed field
 196                  $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
 197              }
 198  
 199              $fieldNums[$count]  = $count;
 200              $fieldNames[$count] = $fieldName;
 201          }
 202          array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
 203          $this->_fieldsDicPositions = array_flip($fieldNums);
 204  
 205          try {
 206              $delFile = $this->openCompoundFile('.del');
 207  
 208              $byteCount = $delFile->readInt();
 209              $byteCount = ceil($byteCount/8);
 210              $bitCount  = $delFile->readInt();
 211  
 212              if ($bitCount == 0) {
 213                  $delBytes = '';
 214              } else {
 215                  $delBytes = $delFile->readBytes($byteCount);
 216              }
 217  
 218              if (extension_loaded('bitset')) {
 219                  $this->_deleted = $delBytes;
 220              } else {
 221                  $this->_deleted = array();
 222                  for ($count = 0; $count < $byteCount; $count++) {
 223                      $byte = ord($delBytes{$count});
 224                      for ($bit = 0; $bit < 8; $bit++) {
 225                          if ($byte & (1<<$bit)) {
 226                              $this->_deleted[$count*8 + $bit] = 1;
 227                          }
 228                      }
 229                  }
 230              }
 231          } catch(Zend_Search_Exception $e) {
 232              if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
 233                  $this->_deleted = null;
 234              } else {
 235                  throw $e;
 236              }
 237          }
 238      }
 239  
 240      /**
 241       * Opens index file stored within compound index file
 242       *
 243       * @param string $extension
 244       * @param boolean $shareHandler
 245       * @throws Zend_Search_Lucene_Exception
 246       * @return Zend_Search_Lucene_Storage_File
 247       */
 248      public function openCompoundFile($extension, $shareHandler = true)
 249      {
 250          $filename = $this->_name . $extension;
 251  
 252          // Try to open common file first
 253          if ($this->_directory->fileExists($filename)) {
 254              return $this->_directory->getFileObject($filename, $shareHandler);
 255          }
 256  
 257          if( !isset($this->_segFiles[$filename]) ) {
 258              throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
 259                                         . $filename . ' file.' );
 260          }
 261  
 262          $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
 263          $file->seek($this->_segFiles[$filename]);
 264          return $file;
 265      }
 266  
 267      /**
 268       * Get compound file length
 269       *
 270       * @param string $extension
 271       * @return integer
 272       */
 273      public function compoundFileLength($extension)
 274      {
 275          $filename = $this->_name . $extension;
 276  
 277          // Try to get common file first
 278          if ($this->_directory->fileExists($filename)) {
 279              return $this->_directory->fileLength($filename);
 280          }
 281  
 282          if( !isset($this->_segFileSizes[$filename]) ) {
 283              throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
 284                                         . $filename . ' file.' );
 285          }
 286  
 287          return $this->_segFileSizes[$filename];
 288      }
 289  
 290      /**
 291       * Returns field index or -1 if field is not found
 292       *
 293       * @param string $fieldName
 294       * @return integer
 295       */
 296      public function getFieldNum($fieldName)
 297      {
 298          foreach( $this->_fields as $field ) {
 299              if( $field->name == $fieldName ) {
 300                  return $field->number;
 301              }
 302          }
 303  
 304          return -1;
 305      }
 306  
 307      /**
 308       * Returns field info for specified field
 309       *
 310       * @param integer $fieldNum
 311       * @return Zend_Search_Lucene_Index_FieldInfo
 312       */
 313      public function getField($fieldNum)
 314      {
 315          return $this->_fields[$fieldNum];
 316      }
 317  
 318      /**
 319       * Returns array of fields.
 320       * if $indexed parameter is true, then returns only indexed fields.
 321       *
 322       * @param boolean $indexed
 323       * @return array
 324       */
 325      public function getFields($indexed = false)
 326      {
 327          $result = array();
 328          foreach( $this->_fields as $field ) {
 329              if( (!$indexed) || $field->isIndexed ) {
 330                  $result[ $field->name ] = $field->name;
 331              }
 332          }
 333          return $result;
 334      }
 335  
 336      /**
 337       * Returns array of FieldInfo objects.
 338       *
 339       * @return array
 340       */
 341      public function getFieldInfos()
 342      {
 343          return $this->_fields;
 344      }
 345  
 346      /**
 347       * Returns the total number of documents in this segment (including deleted documents).
 348       *
 349       * @return integer
 350       */
 351      public function count()
 352      {
 353          return $this->_docCount;
 354      }
 355  
 356      /**
 357       * Returns number of deleted documents.
 358       *
 359       * @return integer
 360       */
 361      private function _deletedCount()
 362      {
 363          if ($this->_deleted === null) {
 364              return 0;
 365          }
 366  
 367          if (extension_loaded('bitset')) {
 368              return count(bitset_to_array($this->_deleted));
 369          } else {
 370              return count($this->_deleted);
 371          }
 372      }
 373  
 374      /**
 375       * Returns the total number of non-deleted documents in this segment.
 376       *
 377       * @return integer
 378       */
 379      public function numDocs()
 380      {
 381          if ($this->hasDeletions()) {
 382              return $this->_docCount - $this->_deletedCount();
 383          } else {
 384              return $this->_docCount;
 385          }
 386      }
 387  
 388      /**
 389       * Get field position in a fields dictionary
 390       *
 391       * @param integer $fieldNum
 392       * @return integer
 393       */
 394      private function _getFieldPosition($fieldNum) {
 395          // Treat values which are not in a translation table as a 'direct value'
 396          return isset($this->_fieldsDicPositions[$fieldNum]) ?
 397                             $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
 398      }
 399  
 400      /**
 401       * Return segment name
 402       *
 403       * @return string
 404       */
 405      public function getName()
 406      {
 407          return $this->_name;
 408      }
 409  
 410  
 411      /**
 412       * TermInfo cache
 413       *
 414       * Size is 1024.
 415       * Numbers are used instead of class constants because of performance considerations
 416       *
 417       * @var array
 418       */
 419      private $_termInfoCache = array();
 420  
 421      private function _cleanUpTermInfoCache()
 422      {
 423          // Clean 256 term infos
 424          foreach ($this->_termInfoCache as $key => $termInfo) {
 425              unset($this->_termInfoCache[$key]);
 426  
 427              // leave 768 last used term infos
 428              if (count($this->_termInfoCache) == 768) {
 429                  break;
 430              }
 431          }
 432      }
 433  
 434      /**
 435       * Scans terms dictionary and returns term info
 436       *
 437       * @param Zend_Search_Lucene_Index_Term $term
 438       * @return Zend_Search_Lucene_Index_TermInfo
 439       */
 440      public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
 441      {
 442          $termKey = $term->key();
 443          if (isset($this->_termInfoCache[$termKey])) {
 444              $termInfo = $this->_termInfoCache[$termKey];
 445  
 446              // Move termInfo to the end of cache
 447              unset($this->_termInfoCache[$termKey]);
 448              $this->_termInfoCache[$termKey] = $termInfo;
 449  
 450              return $termInfo;
 451          }
 452  
 453  
 454          if ($this->_termDictionary === null) {
 455              // Check, if index is already serialized
 456              if ($this->_directory->fileExists($this->_name . '.sti')) {
 457                  // Prefetch dictionary index data
 458                  $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
 459                  $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));
 460  
 461                  // Load dictionary index data
 462                  list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
 463              } else {
 464                  // Prefetch dictionary index data
 465                  $tiiFile = $this->openCompoundFile('.tii');
 466                  $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
 467  
 468                  // Load dictionary index data
 469                  list($this->_termDictionary, $this->_termDictionaryInfos) =
 470                              Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
 471  
 472                  $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
 473                  $stiFile = $this->_directory->createFile($this->_name . '.sti');
 474                  $stiFile->writeBytes($stiFileData);
 475              }
 476  
 477          }
 478  
 479  
 480  
 481          $searchField = $this->getFieldNum($term->field);
 482  
 483          if ($searchField == -1) {
 484              return null;
 485          }
 486          $searchDicField = $this->_getFieldPosition($searchField);
 487  
 488          // search for appropriate value in dictionary
 489          $lowIndex = 0;
 490          $highIndex = count($this->_termDictionary)-1;
 491          while ($highIndex >= $lowIndex) {
 492              // $mid = ($highIndex - $lowIndex)/2;
 493              $mid = ($highIndex + $lowIndex) >> 1;
 494              $midTerm = $this->_termDictionary[$mid];
 495  
 496              $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
 497              $delta = $searchDicField - $fieldNum;
 498              if ($delta == 0) {
 499                  $delta = strcmp($term->text, $midTerm[1] /* text */);
 500              }
 501  
 502              if ($delta < 0) {
 503                  $highIndex = $mid-1;
 504              } elseif ($delta > 0) {
 505                  $lowIndex  = $mid+1;
 506              } else {
 507                  // return $this->_termDictionaryInfos[$mid]; // We got it!
 508                  $a = $this->_termDictionaryInfos[$mid];
 509                  $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
 510  
 511                  // Put loaded termInfo into cache
 512                  $this->_termInfoCache[$termKey] = $termInfo;
 513  
 514                  return $termInfo;
 515              }
 516          }
 517  
 518          if ($highIndex == -1) {
 519              // Term is out of the dictionary range
 520              return null;
 521          }
 522  
 523          $prevPosition = $highIndex;
 524          $prevTerm = $this->_termDictionary[$prevPosition];
 525          $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
 526  
 527          $tisFile = $this->openCompoundFile('.tis');
 528          $tiVersion = $tisFile->readInt();
 529          if ($tiVersion != (int)0xFFFFFFFE) {
 530              throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
 531          }
 532  
 533          $termCount     = $tisFile->readLong();
 534          $indexInterval = $tisFile->readInt();
 535          $skipInterval  = $tisFile->readInt();
 536  
 537          $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);
 538  
 539          $termValue    = $prevTerm[1] /* text */;
 540          $termFieldNum = $prevTerm[0] /* field */;
 541          $freqPointer = $prevTermInfo[1] /* freqPointer */;
 542          $proxPointer = $prevTermInfo[2] /* proxPointer */;
 543          for ($count = $prevPosition*$indexInterval + 1;
 544               $count <= $termCount &&
 545               ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
 546                ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
 547                 strcmp($termValue, $term->text) < 0) );
 548               $count++) {
 549              $termPrefixLength = $tisFile->readVInt();
 550              $termSuffix       = $tisFile->readString();
 551              $termFieldNum     = $tisFile->readVInt();
 552              $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
 553  
 554              $docFreq      = $tisFile->readVInt();
 555              $freqPointer += $tisFile->readVInt();
 556              $proxPointer += $tisFile->readVInt();
 557              if( $docFreq >= $skipInterval ) {
 558                  $skipOffset = $tisFile->readVInt();
 559              } else {
 560                  $skipOffset = 0;
 561              }
 562          }
 563  
 564          if ($termFieldNum == $searchField && $termValue == $term->text) {
 565              $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
 566          } else {
 567              $termInfo = null;
 568          }
 569  
 570          // Put loaded termInfo into cache
 571          $this->_termInfoCache[$termKey] = $termInfo;
 572  
 573          if (count($this->_termInfoCache) == 1024) {
 574              $this->_cleanUpTermInfoCache();
 575          }
 576  
 577          return $termInfo;
 578      }
 579  
 580      /**
 581       * Returns term freqs array.
 582       * Result array structure: array(docId => freq, ...)
 583       *
 584       * @param Zend_Search_Lucene_Index_Term $term
 585       * @param integer $shift
 586       * @return array
 587       */
 588      public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
 589      {
 590          $termInfo = $this->getTermInfo($term);
 591  
 592          if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
 593              return array();
 594          }
 595  
 596          $frqFile = $this->openCompoundFile('.frq');
 597          $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
 598          $result = array();
 599          $docId = 0;
 600  
 601          for ($count = 0; $count < $termInfo->docFreq; $count++) {
 602              $docDelta = $frqFile->readVInt();
 603              if ($docDelta % 2 == 1) {
 604                  $docId += ($docDelta-1)/2;
 605                  $result[$shift + $docId] = 1;
 606              } else {
 607                  $docId += $docDelta/2;
 608                  $result[$shift + $docId] = $frqFile->readVInt();
 609              }
 610          }
 611  
 612          return $result;
 613      }
 614  
 615      /**
 616       * Returns term positions array.
 617       * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 618       *
 619       * @param Zend_Search_Lucene_Index_Term $term
 620       * @param integer $shift
 621       * @return array
 622       */
 623      public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
 624      {
 625          $termInfo = $this->getTermInfo($term);
 626  
 627          if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
 628              return array();
 629          }
 630  
 631          $frqFile = $this->openCompoundFile('.frq');
 632          $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
 633          $freqs = array();
 634          $docId = 0;
 635  
 636          for ($count = 0; $count < $termInfo->docFreq; $count++) {
 637              $docDelta = $frqFile->readVInt();
 638              if ($docDelta % 2 == 1) {
 639                  $docId += ($docDelta-1)/2;
 640                  $freqs[$docId] = 1;
 641              } else {
 642                  $docId += $docDelta/2;
 643                  $freqs[$docId] = $frqFile->readVInt();
 644              }
 645          }
 646  
 647          $result = array();
 648          $prxFile = $this->openCompoundFile('.prx');
 649          $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
 650          foreach ($freqs as $docId => $freq) {
 651              $termPosition = 0;
 652              $positions = array();
 653  
 654              for ($count = 0; $count < $freq; $count++ ) {
 655                  $termPosition += $prxFile->readVInt();
 656                  $positions[] = $termPosition;
 657              }
 658  
 659              $result[$shift + $docId] = $positions;
 660          }
 661  
 662          return $result;
 663      }
 664  
 665      /**
 666       * Load normalization factors from an index file
 667       *
 668       * @param integer $fieldNum
 669       */
 670      private function _loadNorm($fieldNum)
 671      {
 672          $fFile = $this->openCompoundFile('.f' . $fieldNum);
 673          $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
 674      }
 675  
 676      /**
 677       * Returns normalization factor for specified documents
 678       *
 679       * @param integer $id
 680       * @param string $fieldName
 681       * @return float
 682       */
 683      public function norm($id, $fieldName)
 684      {
 685          $fieldNum = $this->getFieldNum($fieldName);
 686  
 687          if ( !($this->_fields[$fieldNum]->isIndexed) ) {
 688              return null;
 689          }
 690  
 691          if (!isset($this->_norms[$fieldNum])) {
 692              $this->_loadNorm($fieldNum);
 693          }
 694  
 695          return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) );
 696      }
 697  
 698      /**
 699       * Returns norm vector, encoded in a byte string
 700       *
 701       * @param string $fieldName
 702       * @return string
 703       */
 704      public function normVector($fieldName)
 705      {
 706          $fieldNum = $this->getFieldNum($fieldName);
 707  
 708          if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
 709              $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
 710  
 711              return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
 712                                $this->_docCount);
 713          }
 714  
 715          if (!isset($this->_norms[$fieldNum])) {
 716              $this->_loadNorm($fieldNum);
 717          }
 718  
 719          return $this->_norms[$fieldNum];
 720      }
 721  
 722  
 723      /**
 724       * Returns true if any documents have been deleted from this index segment.
 725       *
 726       * @return boolean
 727       */
 728      public function hasDeletions()
 729      {
 730          return $this->_deleted !== null;
 731      }
 732  
 733  
 734      /**
 735       * Deletes a document from the index segment.
 736       * $id is an internal document id
 737       *
 738       * @param integer
 739       */
 740      public function delete($id)
 741      {
 742          $this->_deletedDirty = true;
 743  
 744          if (extension_loaded('bitset')) {
 745              if ($this->_deleted === null) {
 746                  $this->_deleted = bitset_empty($id);
 747              }
 748              bitset_incl($this->_deleted, $id);
 749          } else {
 750              if ($this->_deleted === null) {
 751                  $this->_deleted = array();
 752              }
 753  
 754              $this->_deleted[$id] = 1;
 755          }
 756      }
 757  
 758      /**
 759       * Checks, that document is deleted
 760       *
 761       * @param integer
 762       * @return boolean
 763       */
 764      public function isDeleted($id)
 765      {
 766          if ($this->_deleted === null) {
 767              return false;
 768          }
 769  
 770          if (extension_loaded('bitset')) {
 771              return bitset_in($this->_deleted, $id);
 772          } else {
 773              return isset($this->_deleted[$id]);
 774          }
 775      }
 776  
 777  
 778      /**
 779       * Write changes if it's necessary.
 780       */
 781      public function writeChanges()
 782      {
 783          if (!$this->_deletedDirty) {
 784              return;
 785          }
 786  
 787          if (extension_loaded('bitset')) {
 788              $delBytes = $this->_deleted;
 789              $bitCount = count(bitset_to_array($delBytes));
 790          } else {
 791              $byteCount = floor($this->_docCount/8)+1;
 792              $delBytes = str_repeat(chr(0), $byteCount);
 793              for ($count = 0; $count < $byteCount; $count++) {
 794                  $byte = 0;
 795                  for ($bit = 0; $bit < 8; $bit++) {
 796                      if (isset($this->_deleted[$count*8 + $bit])) {
 797                          $byte |= (1<<$bit);
 798                      }
 799                  }
 800                  $delBytes{$count} = chr($byte);
 801              }
 802              $bitCount = count($this->_deleted);
 803          }
 804  
 805  
 806          $delFile = $this->_directory->createFile($this->_name . '.del');
 807          $delFile->writeInt($this->_docCount);
 808          $delFile->writeInt($bitCount);
 809          $delFile->writeBytes($delBytes);
 810  
 811          $this->_deletedDirty = false;
 812      }
 813  
 814  
 815  
 816      /**
 817       * Term Dictionary File object for stream like terms reading
 818       *
 819       * @var Zend_Search_Lucene_Storage_File
 820       */
 821      private $_tisFile = null;
 822  
 823      /**
 824       * Frequencies File object for stream like terms reading
 825       *
 826       * @var Zend_Search_Lucene_Storage_File
 827       */
 828      private $_frqFile = null;
 829  
 830      /**
 831       * Offset of the .frq file in the compound file
 832       *
 833       * @var integer
 834       */
 835      private $_frqFileOffset;
 836  
 837      /**
 838       * Positions File object for stream like terms reading
 839       *
 840       * @var Zend_Search_Lucene_Storage_File
 841       */
 842      private $_prxFile = null;
 843  
 844      /**
 845       * Offset of the .prx file in the compound file
 846       *
 847       * @var integer
 848       */
 849      private $_prxFileOffset;
 850  
 851  
 852      /**
 853       * Number of terms in term stream
 854       *
 855       * @var integer
 856       */
 857      private $_termCount = 0;
 858  
 859      /**
 860       * Segment skip interval
 861       *
 862       * @var integer
 863       */
 864      private $_skipInterval;
 865  
 866      /**
 867       * Last TermInfo in a terms stream
 868       *
 869       * @var Zend_Search_Lucene_Index_TermInfo
 870       */
 871      private $_lastTermInfo = null;
 872  
 873      /**
 874       * Last Term in a terms stream
 875       *
 876       * @var Zend_Search_Lucene_Index_Term
 877       */
 878      private $_lastTerm = null;
 879  
 880      /**
 881       * Map of the document IDs
 882       * Used to get new docID after removing deleted documents.
 883       * It's not very effective from memory usage point of view,
 884       * but much faster than other methods
 885       *
 886       * @var array|null
 887       */
 888      private $_docMap = null;
 889  
 890      /**
 891       * An array of all term positions in the documents.
 892       * Array structure: array( docId => array( pos1, pos2, ...), ...)
 893       *
 894       * @var array
 895       */
 896      private $_lastTermPositions;
 897  
 898      /**
 899       * Reset terms stream
 900       *
 901       * $startId - id for the first document
 902       * $compact - remove deleted documents
 903       *
 904       * Returns start document id for the next segment
 905       *
 906       * @param integer $startId
 907       * @param boolean $compact
 908       * @throws Zend_Search_Lucene_Exception
 909       * @return integer
 910       */
 911      public function reset($startId = 0, $compact = false)
 912      {
 913          if ($this->_tisFile !== null) {
 914              $this->_tisFile = null;
 915          }
 916  
 917          $this->_tisFile = $this->openCompoundFile('.tis', false);
 918          $tiVersion = $this->_tisFile->readInt();
 919          if ($tiVersion != (int)0xFFFFFFFE) {
 920              throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
 921          }
 922  
 923          $this->_termCount    = $this->_tisFile->readLong();
 924                                 $this->_tisFile->readInt();  // Read Index interval
 925          $this->_skipInterval = $this->_tisFile->readInt();  // Read skip interval
 926  
 927          if ($this->_frqFile !== null) {
 928              $this->_frqFile = null;
 929          }
 930          $this->_frqFile = $this->openCompoundFile('.frq', false);
 931          $this->_frqFileOffset = $this->_frqFile->tell();
 932  
 933          if ($this->_prxFile !== null) {
 934              $this->_prxFile = null;
 935          }
 936          $this->_prxFile = $this->openCompoundFile('.prx', false);
 937          $this->_prxFileOffset = $this->_prxFile->tell();
 938  
 939          $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
 940          $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
 941  
 942          $this->_docMap = array();
 943          for ($count = 0; $count < $this->_docCount; $count++) {
 944              if (!$this->isDeleted($count)) {
 945                  $this->_docMap[$count] = $startId + ($compact ? count($this->_docMap) : $count);
 946              }
 947          }
 948  
 949          $this->nextTerm();
 950          return $startId + ($compact ? count($this->_docMap) : $this->_docCount);
 951      }
 952  
 953  
 954      /**
 955       * Scans terms dictionary and returns next term
 956       *
 957       * @return Zend_Search_Lucene_Index_Term|null
 958       */
 959      public function nextTerm()
 960      {
 961          if ($this->_tisFile === null  ||  $this->_termCount == 0) {
 962              $this->_lastTerm     = null;
 963              $this->_lastTermInfo = null;
 964  
 965              // may be necessary for "empty" segment
 966              $this->_tisFile = null;
 967              $this->_frqFile = null;
 968              $this->_prxFile = null;
 969  
 970              return null;
 971          }
 972  
 973          $termPrefixLength = $this->_tisFile->readVInt();
 974          $termSuffix       = $this->_tisFile->readString();
 975          $termFieldNum     = $this->_tisFile->readVInt();
 976          $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;
 977  
 978          $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);
 979  
 980          $docFreq     = $this->_tisFile->readVInt();
 981          $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
 982          $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
 983          if ($docFreq >= $this->_skipInterval) {
 984              $skipOffset = $this->_tisFile->readVInt();
 985          } else {
 986              $skipOffset = 0;
 987          }
 988  
 989          $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
 990  
 991  
 992          $this->_lastTermPositions = array();
 993  
 994          $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
 995          $freqs = array();   $docId = 0;
 996          for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
 997              $docDelta = $this->_frqFile->readVInt();
 998              if( $docDelta % 2 == 1 ) {
 999                  $docId += ($docDelta-1)/2;
1000                  $freqs[ $docId ] = 1;
1001              } else {
1002                  $docId += $docDelta/2;
1003                  $freqs[ $docId ] = $this->_frqFile->readVInt();
1004              }
1005          }
1006  
1007          $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
1008          foreach ($freqs as $docId => $freq) {
1009              $termPosition = 0;  $positions = array();
1010  
1011              for ($count = 0; $count < $freq; $count++ ) {
1012                  $termPosition += $this->_prxFile->readVInt();
1013                  $positions[] = $termPosition;
1014              }
1015  
1016              if (isset($this->_docMap[$docId])) {
1017                  $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
1018              }
1019          }
1020  
1021  
1022          $this->_termCount--;
1023          if ($this->_termCount == 0) {
1024              $this->_tisFile = null;
1025              $this->_frqFile = null;
1026              $this->_prxFile = null;
1027          }
1028  
1029          return $this->_lastTerm;
1030      }
1031  
1032  
1033      /**
1034       * Returns term in current position
1035       * (null before reset() or once nextTerm() has run past the last term)
1036       *
1037       * @return Zend_Search_Lucene_Index_Term|null
1038       */
1039      public function currentTerm()
1040      {
1041          return $this->_lastTerm;
1042      }
1043  
1044  
1045      /**
1046       * Returns an array of all term positions in the documents.
1047       * Return array structure: array( docId => array( pos1, pos2, ...), ...)
1048       *
1049       * @return array
1050       */
1051      public function currentTermPositions()
1052      {
1053          return $this->_lastTermPositions;
1054      }
1055  }
1056  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7