[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/search/Zend/Search/Lucene/Index/ -> SegmentWriter.php (source)

   1  <?php
   2  /**
   3   * Zend Framework
   4   *
   5   * LICENSE
   6   *
   7   * This source file is subject to the new BSD license that is bundled
   8   * with this package in the file LICENSE.txt.
   9   * It is also available through the world-wide-web at this URL:
  10   * http://framework.zend.com/license/new-bsd
  11   * If you did not receive a copy of the license and are unable to
  12   * obtain it through the world-wide-web, please send an email
  13   * to license@zend.com so we can send you a copy immediately.
  14   *
  15   * @category   Zend
  16   * @package    Zend_Search_Lucene
  17   * @subpackage Index
  18   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20   */
  21  
  22  
  23  /** Zend_Search_Lucene_Exception */
  24  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
  25  
  26  /** Zend_Search_Lucene_Index_SegmentInfo */
  27  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
  28  
  29  
  30  /**
  31   * @category   Zend
  32   * @package    Zend_Search_Lucene
  33   * @subpackage Index
  34   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  35   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  36   */
  37  abstract class Zend_Search_Lucene_Index_SegmentWriter
  38  {
  39      /**
  40       * Expert: The fraction of terms in the "dictionary" which should be stored
  41       * in RAM.  Smaller values use more memory, but make searching slightly
  42       * faster, while larger values use less memory and make searching slightly
  43       * slower.  Searching is typically not dominated by dictionary lookup, so
  44       * tweaking this is rarely useful.
  45       *
  46       * @var integer
  47       */
  48      public static $indexInterval = 128;
  49  
  50      /** Expert: The fraction of TermDocs entries stored in skip tables.
  51       * Larger values result in smaller indexes, greater acceleration, but fewer
  52       * accelerable cases, while smaller values result in bigger indexes,
  53       * less acceleration and more
  54       * accelerable cases. More detailed experiments would be useful here.
  55       *
  56       * 0x0x7FFFFFFF indicates that we don't use skip data
  57       * Default value is 16
  58       *
  59       * @var integer
  60       */
  61      public static $skipInterval = 0x7FFFFFFF;
  62  
  63      /**
  64       * Number of docs in a segment
  65       *
  66       * @var integer
  67       */
  68      protected $_docCount = 0;
  69  
  70      /**
  71       * Segment name
  72       *
  73       * @var string
  74       */
  75      protected $_name;
  76  
  77      /**
  78       * File system adapter.
  79       *
  80       * @var Zend_Search_Lucene_Storage_Directory
  81       */
  82      protected $_directory;
  83  
  84      /**
  85       * List of the index files.
  86       * Used for automatic compound file generation
  87       *
  88       * @var unknown_type
  89       */
  90      protected $_files = array();
  91  
  92      /**
  93       * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
  94       *
  95       * @var array
  96       */
  97      protected $_fields = array();
  98  
  99      /**
 100       * Normalization factors.
 101       * An array fieldName => normVector
 102       * normVector is a binary string.
 103       * Each byte corresponds to an indexed document in a segment and
 104       * encodes normalization factor (float value, encoded by
 105       * Zend_Search_Lucene_Search_Similarity::encodeNorm())
 106       *
 107       * @var array
 108       */
 109      protected $_norms = array();
 110  
 111  
 112      /**
 113       * '.fdx'  file - Stored Fields, the field index.
 114       *
 115       * @var Zend_Search_Lucene_Storage_File
 116       */
 117      protected $_fdxFile = null;
 118  
 119      /**
 120       * '.fdt'  file - Stored Fields, the field data.
 121       *
 122       * @var Zend_Search_Lucene_Storage_File
 123       */
 124      protected $_fdtFile = null;
 125  
 126  
 127      /**
 128       * Object constructor.
 129       *
 130       * @param Zend_Search_Lucene_Storage_Directory $directory
 131       * @param string $name
 132       */
 133      public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
 134      {
 135          $this->_directory = $directory;
 136          $this->_name      = $name;
 137      }
 138  
 139  
 140      /**
 141       * Add field to the segment
 142       *
 143       * Returns actual field number
 144       *
 145       * @param Zend_Search_Lucene_Field $field
 146       * @return integer
 147       */
 148      public function addField(Zend_Search_Lucene_Field $field)
 149      {
 150          if (!isset($this->_fields[$field->name])) {
 151              $fieldNumber = count($this->_fields);
 152              $this->_fields[$field->name] =
 153                                  new Zend_Search_Lucene_Index_FieldInfo($field->name,
 154                                                                         $field->isIndexed,
 155                                                                         $fieldNumber,
 156                                                                         $field->storeTermVector);
 157  
 158              return $fieldNumber;
 159          } else {
 160              $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
 161              $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
 162  
 163              return $this->_fields[$field->name]->number;
 164          }
 165      }
 166  
 167      /**
 168       * Add fieldInfo to the segment
 169       *
 170       * Returns actual field number
 171       *
 172       * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
 173       * @return integer
 174       */
 175      public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
 176      {
 177          if (!isset($this->_fields[$fieldInfo->name])) {
 178              $fieldNumber = count($this->_fields);
 179              $this->_fields[$fieldInfo->name] =
 180                                  new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
 181                                                                         $fieldInfo->isIndexed,
 182                                                                         $fieldNumber,
 183                                                                         $fieldInfo->storeTermVector);
 184  
 185              return $fieldNumber;
 186          } else {
 187              $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
 188              $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
 189  
 190              return $this->_fields[$fieldInfo->name]->number;
 191          }
 192      }
 193  
 194      /**
 195       * Returns array of FieldInfo objects.
 196       *
 197       * @return array
 198       */
 199      public function getFieldInfos()
 200      {
 201          return $this->_fields;
 202      }
 203  
 204      /**
 205       * Add stored fields information
 206       *
 207       * @param array $storedFields array of Zend_Search_Lucene_Field objects
 208       */
 209      public function addStoredFields($storedFields)
 210      {
 211          if (!isset($this->_fdxFile)) {
 212              $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
 213              $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
 214  
 215              $this->_files[] = $this->_name . '.fdx';
 216              $this->_files[] = $this->_name . '.fdt';
 217          }
 218  
 219          $this->_fdxFile->writeLong($this->_fdtFile->tell());
 220          $this->_fdtFile->writeVInt(count($storedFields));
 221          foreach ($storedFields as $field) {
 222              $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
 223              $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
 224                           ($field->isBinary ?    0x02 : 0x00) |
 225                           0x00; /* 0x04 - third bit, compressed (ZLIB) */
 226              $this->_fdtFile->writeByte($fieldBits);
 227              if ($field->isBinary) {
 228                  $this->_fdtFile->writeVInt(strlen($field->value));
 229                  $this->_fdtFile->writeBytes($field->value);
 230              } else {
 231                  $this->_fdtFile->writeString($field->getUtf8Value());
 232              }
 233          }
 234  
 235          $this->_docCount++;
 236      }
 237  
 238      /**
 239       * Returns the total number of documents in this segment.
 240       *
 241       * @return integer
 242       */
 243      public function count()
 244      {
 245          return $this->_docCount;
 246      }
 247  
 248      /**
 249       * Dump Field Info (.fnm) segment file
 250       */
 251      protected function _dumpFNM()
 252      {
 253          $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
 254          $fnmFile->writeVInt(count($this->_fields));
 255  
 256          foreach ($this->_fields as $field) {
 257              $fnmFile->writeString($field->name);
 258              $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
 259                                  ($field->storeTermVector ? 0x02 : 0x00)
 260  // not supported yet            0x04 /* term positions are stored with the term vectors */ |
 261  // not supported yet            0x08 /* term offsets are stored with the term vectors */   |
 262                                 );
 263  
 264              if ($field->isIndexed) {
 265                  $normFileName = $this->_name . '.f' . $field->number;
 266                  $fFile = $this->_directory->createFile($normFileName);
 267                  $fFile->writeBytes($this->_norms[$field->name]);
 268                  $this->_files[] = $normFileName;
 269              }
 270          }
 271  
 272          $this->_files[] = $this->_name . '.fnm';
 273      }
 274  
 275  
 276  
 277      /**
 278       * Term Dictionary file
 279       *
 280       * @var Zend_Search_Lucene_Storage_File
 281       */
 282      private $_tisFile = null;
 283  
 284      /**
 285       * Term Dictionary index file
 286       *
 287       * @var Zend_Search_Lucene_Storage_File
 288       */
 289      private $_tiiFile = null;
 290  
 291      /**
 292       * Frequencies file
 293       *
 294       * @var Zend_Search_Lucene_Storage_File
 295       */
 296      private $_frqFile = null;
 297  
 298      /**
 299       * Positions file
 300       *
 301       * @var Zend_Search_Lucene_Storage_File
 302       */
 303      private $_prxFile = null;
 304  
 305      /**
 306       * Number of written terms
 307       *
 308       * @var integer
 309       */
 310      private $_termCount;
 311  
 312  
 313      /**
 314       * Last saved term
 315       *
 316       * @var Zend_Search_Lucene_Index_Term
 317       */
 318      private $_prevTerm;
 319  
 320      /**
 321       * Last saved term info
 322       *
 323       * @var Zend_Search_Lucene_Index_TermInfo
 324       */
 325      private $_prevTermInfo;
 326  
 327      /**
 328       * Last saved index term
 329       *
 330       * @var Zend_Search_Lucene_Index_Term
 331       */
 332      private $_prevIndexTerm;
 333  
 334      /**
 335       * Last saved index term info
 336       *
 337       * @var Zend_Search_Lucene_Index_TermInfo
 338       */
 339      private $_prevIndexTermInfo;
 340  
 341      /**
 342       * Last term dictionary file position
 343       *
 344       * @var integer
 345       */
 346      private $_lastIndexPosition;
 347  
 348      /**
 349       * Create dicrionary, frequency and positions files and write necessary headers
 350       */
 351      public function initializeDictionaryFiles()
 352      {
 353          $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
 354          $this->_tisFile->writeInt((int)0xFFFFFFFE);
 355          $this->_tisFile->writeLong(0 /* dummy data for terms count */);
 356          $this->_tisFile->writeInt(self::$indexInterval);
 357          $this->_tisFile->writeInt(self::$skipInterval);
 358  
 359          $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
 360          $this->_tiiFile->writeInt((int)0xFFFFFFFE);
 361          $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
 362          $this->_tiiFile->writeInt(self::$indexInterval);
 363          $this->_tiiFile->writeInt(self::$skipInterval);
 364  
 365          /** Dump dictionary header */
 366          $this->_tiiFile->writeVInt(0);                    // preffix length
 367          $this->_tiiFile->writeString('');                 // suffix
 368          $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
 369          $this->_tiiFile->writeByte((int)0x0F);
 370          $this->_tiiFile->writeVInt(0);                    // DocFreq
 371          $this->_tiiFile->writeVInt(0);                    // FreqDelta
 372          $this->_tiiFile->writeVInt(0);                    // ProxDelta
 373          $this->_tiiFile->writeVInt(20);                   // IndexDelta
 374  
 375          $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
 376          $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
 377  
 378          $this->_files[] = $this->_name . '.tis';
 379          $this->_files[] = $this->_name . '.tii';
 380          $this->_files[] = $this->_name . '.frq';
 381          $this->_files[] = $this->_name . '.prx';
 382  
 383          $this->_prevTerm          = null;
 384          $this->_prevTermInfo      = null;
 385          $this->_prevIndexTerm     = null;
 386          $this->_prevIndexTermInfo = null;
 387          $this->_lastIndexPosition = 20;
 388          $this->_termCount         = 0;
 389  
 390      }
 391  
 392      /**
 393       * Add term
 394       *
 395       * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 396       *
 397       * @param Zend_Search_Lucene_Index_Term $termEntry
 398       * @param array $termDocs
 399       */
 400      public function addTerm($termEntry, $termDocs)
 401      {
 402          $freqPointer = $this->_frqFile->tell();
 403          $proxPointer = $this->_prxFile->tell();
 404  
 405          $prevDoc = 0;
 406          foreach ($termDocs as $docId => $termPositions) {
 407              $docDelta = ($docId - $prevDoc)*2;
 408              $prevDoc = $docId;
 409              if (count($termPositions) > 1) {
 410                  $this->_frqFile->writeVInt($docDelta);
 411                  $this->_frqFile->writeVInt(count($termPositions));
 412              } else {
 413                  $this->_frqFile->writeVInt($docDelta + 1);
 414              }
 415  
 416              $prevPosition = 0;
 417              foreach ($termPositions as $position) {
 418                  $this->_prxFile->writeVInt($position - $prevPosition);
 419                  $prevPosition = $position;
 420              }
 421          }
 422  
 423          if (count($termDocs) >= self::$skipInterval) {
 424              /**
 425               * @todo Write Skip Data to a freq file.
 426               * It's not used now, but make index more optimal
 427               */
 428              $skipOffset = $this->_frqFile->tell() - $freqPointer;
 429          } else {
 430              $skipOffset = 0;
 431          }
 432  
 433          $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
 434                                                    $this->_fields[$termEntry->field]->number);
 435          $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
 436                                                            $freqPointer, $proxPointer, $skipOffset);
 437  
 438          $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
 439  
 440          if (($this->_termCount + 1) % self::$indexInterval == 0) {
 441              $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
 442  
 443              $indexPosition = $this->_tisFile->tell();
 444              $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
 445              $this->_lastIndexPosition = $indexPosition;
 446  
 447          }
 448          $this->_termCount++;
 449      }
 450  
 451      /**
 452       * Close dictionary
 453       */
 454      public function closeDictionaryFiles()
 455      {
 456          $this->_tisFile->seek(4);
 457          $this->_tisFile->writeLong($this->_termCount);
 458  
 459          $this->_tiiFile->seek(4);
 460          $this->_tiiFile->writeLong(ceil(($this->_termCount + 2)/self::$indexInterval));
 461      }
 462  
 463  
 464      /**
 465       * Dump Term Dictionary segment file entry.
 466       * Used to write entry to .tis or .tii files
 467       *
 468       * @param Zend_Search_Lucene_Storage_File $dicFile
 469       * @param Zend_Search_Lucene_Index_Term $prevTerm
 470       * @param Zend_Search_Lucene_Index_Term $term
 471       * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
 472       * @param Zend_Search_Lucene_Index_TermInfo $termInfo
 473       */
 474      protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
 475                                          &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
 476                                          &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
 477      {
 478          if (isset($prevTerm) && $prevTerm->field == $term->field) {
 479              $matchedBytes = 0;
 480              $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
 481              while ($matchedBytes < $maxBytes  &&
 482                     $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
 483                  $matchedBytes++;
 484              }
 485  
 486              // Calculate actual matched UTF-8 pattern
 487              $prefixBytes = 0;
 488              $prefixChars = 0;
 489              while ($prefixBytes < $matchedBytes) {
 490                  $charBytes = 1;
 491                  if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
 492                      $charBytes++;
 493                      if (ord($term->text[$prefixBytes]) & 0x20 ) {
 494                          $charBytes++;
 495                          if (ord($term->text[$prefixBytes]) & 0x10 ) {
 496                              $charBytes++;
 497                          }
 498                      }
 499                  }
 500  
 501                  if ($prefixBytes + $charBytes > $matchedBytes) {
 502                      // char crosses matched bytes boundary
 503                      // skip char
 504                      break;
 505                  }
 506  
 507                  $prefixChars++;
 508                  $prefixBytes += $charBytes;
 509              }
 510  
 511              // Write preffix length
 512              $dicFile->writeVInt($prefixChars);
 513              // Write suffix
 514              $dicFile->writeString(substr($term->text, $prefixBytes));
 515          } else {
 516              // Write preffix length
 517              $dicFile->writeVInt(0);
 518              // Write suffix
 519              $dicFile->writeString($term->text);
 520          }
 521          // Write field number
 522          $dicFile->writeVInt($term->field);
 523          // DocFreq (the count of documents which contain the term)
 524          $dicFile->writeVInt($termInfo->docFreq);
 525  
 526          $prevTerm = $term;
 527  
 528          if (!isset($prevTermInfo)) {
 529              // Write FreqDelta
 530              $dicFile->writeVInt($termInfo->freqPointer);
 531              // Write ProxDelta
 532              $dicFile->writeVInt($termInfo->proxPointer);
 533          } else {
 534              // Write FreqDelta
 535              $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
 536              // Write ProxDelta
 537              $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
 538          }
 539          // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
 540          if ($termInfo->skipOffset != 0) {
 541              $dicFile->writeVInt($termInfo->skipOffset);
 542          }
 543  
 544          $prevTermInfo = $termInfo;
 545      }
 546  
 547  
 548      /**
 549       * Generate compound index file
 550       */
 551      protected function _generateCFS()
 552      {
 553          $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
 554          $cfsFile->writeVInt(count($this->_files));
 555  
 556          $dataOffsetPointers = array();
 557          foreach ($this->_files as $fileName) {
 558              $dataOffsetPointers[$fileName] = $cfsFile->tell();
 559              $cfsFile->writeLong(0); // write dummy data
 560              $cfsFile->writeString($fileName);
 561          }
 562  
 563          foreach ($this->_files as $fileName) {
 564              // Get actual data offset
 565              $dataOffset = $cfsFile->tell();
 566              // Seek to the data offset pointer
 567              $cfsFile->seek($dataOffsetPointers[$fileName]);
 568              // Write actual data offset value
 569              $cfsFile->writeLong($dataOffset);
 570              // Seek back to the end of file
 571              $cfsFile->seek($dataOffset);
 572  
 573              $dataFile = $this->_directory->getFileObject($fileName);
 574  
 575              $byteCount = $this->_directory->fileLength($fileName);
 576              while ($byteCount > 0) {
 577                  $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
 578                  $byteCount -= strlen($data);
 579                  $cfsFile->writeBytes($data);
 580              }
 581  
 582              $this->_directory->deleteFile($fileName);
 583          }
 584      }
 585  
 586  
 587      /**
 588       * Close segment, write it to disk and return segment info
 589       *
 590       * @return Zend_Search_Lucene_Index_SegmentInfo
 591       */
 592      abstract public function close();
 593  }
 594  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7