[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/search/Zend/Search/Lucene/Search/Query/ -> Phrase.php (source)

   1  <?php
   2  /**
   3   * Zend Framework
   4   *
   5   * LICENSE
   6   *
   7   * This source file is subject to the new BSD license that is bundled
   8   * with this package in the file LICENSE.txt.
   9   * It is also available through the world-wide-web at this URL:
  10   * http://framework.zend.com/license/new-bsd
  11   * If you did not receive a copy of the license and are unable to
  12   * obtain it through the world-wide-web, please send an email
  13   * to license@zend.com so we can send you a copy immediately.
  14   *
  15   * @category   Zend
  16   * @package    Zend_Search_Lucene
  17   * @subpackage Search
  18   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20   */
  21  
  22  
  23  /**
  24   * Zend_Search_Lucene_Search_Query
  25   */
  26  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Query.php';
  27  
  28  /**
  29   * Zend_Search_Lucene_Search_Weight_Phrase
  30   */
  31  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Weight/Phrase.php';
  32  
  33  
  34  /**
  35   * A Query that matches documents containing a particular sequence of terms.
  36   *
  37   * @category   Zend
  38   * @package    Zend_Search_Lucene
  39   * @subpackage Search
  40   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  41   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  42   */
  43  class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
  44  {
  45      /**
  46       * Terms to find.
  47       * Array of Zend_Search_Lucene_Index_Term objects.
  48       *
  49       * @var array
  50       */
  51      private $_terms;
  52  
  53      /**
  54       * Term positions (relative positions of terms within the phrase).
  55       * Array of integers
  56       *
  57       * @var array
  58       */
  59      private $_offsets;
  60  
  61      /**
  62       * Sets the number of other words permitted between words in query phrase.
  63       * If zero, then this is an exact phrase search.  For larger values this works
  64       * like a WITHIN or NEAR operator.
  65       *
  66       * The slop is in fact an edit-distance, where the units correspond to
  67       * moves of terms in the query phrase out of position.  For example, to switch
  68       * the order of two words requires two moves (the first move places the words
  69       * atop one another), so to permit re-orderings of phrases, the slop must be
  70       * at least two.
  71       * More exact matches are scored higher than sloppier matches, thus search
  72       * results are sorted by exactness.
  73       *
  74       * The slop is zero by default, requiring exact matches.
  75       *
  76       * @var integer
  77       */
  78      private $_slop;
  79  
  80      /**
  81       * Result vector.
  82       *
  83       * @var array
  84       */
  85      private $_resVector = null;
  86  
  87      /**
  88       * Terms positions vectors.
  89       * Array of Arrays:
  90       * term1Id => (docId => array( pos1, pos2, ... ), ...)
  91       * term2Id => (docId => array( pos1, pos2, ... ), ...)
  92       *
  93       * @var array
  94       */
  95      private $_termsPositions = array();
  96  
  97      /**
  98       * Class constructor.  Create a new prase query.
  99       *
 100       * @param string $field    Field to search.
 101       * @param array  $terms    Terms to search Array of strings.
 102       * @param array  $offsets  Relative term positions. Array of integers.
 103       * @throws Zend_Search_Lucene_Exception
 104       */
 105      public function __construct($terms = null, $offsets = null, $field = null)
 106      {
 107          $this->_slop = 0;
 108  
 109          if (is_array($terms)) {
 110              $this->_terms = array();
 111              foreach ($terms as $termId => $termText) {
 112                  $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field):
 113                                                              new Zend_Search_Lucene_Index_Term($termText);
 114              }
 115          } else if ($terms === null) {
 116              $this->_terms = array();
 117          } else {
 118              throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
 119          }
 120  
 121          if (is_array($offsets)) {
 122              if (count($this->_terms) != count($offsets)) {
 123                  throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
 124              }
 125              $this->_offsets = $offsets;
 126          } else if ($offsets === null) {
 127              $this->_offsets = array();
 128              foreach ($this->_terms as $termId => $term) {
 129                  $position = count($this->_offsets);
 130                  $this->_offsets[$termId] = $position;
 131              }
 132          } else {
 133              throw new Zend_Search_Lucene_Exception('offsets argument must be array of strings or null');
 134          }
 135      }
 136  
 137      /**
 138       * Set slop
 139       *
 140       * @param integer $slop
 141       */
 142      public function setSlop($slop)
 143      {
 144          $this->_slop = $slop;
 145      }
 146  
 147  
 148      /**
 149       * Get slop
 150       *
 151       * @return integer
 152       */
 153      public function getSlop()
 154      {
 155          return $this->_slop;
 156      }
 157  
 158  
 159      /**
 160       * Adds a term to the end of the query phrase.
 161       * The relative position of the term is specified explicitly or the one immediately
 162       * after the last term added.
 163       *
 164       * @param Zend_Search_Lucene_Index_Term $term
 165       * @param integer $position
 166       */
 167      public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) {
 168          if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) {
 169              throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
 170                                                     $term->field . ':' . $term->text);
 171          }
 172  
 173          $this->_terms[] = $term;
 174          if ($position !== null) {
 175              $this->_offsets[] = $position;
 176          } else if (count($this->_offsets) != 0) {
 177              $this->_offsets[] = end($this->_offsets) + 1;
 178          } else {
 179              $this->_offsets[] = 0;
 180          }
 181      }
 182  
 183  
 184      /**
 185       * Re-write query into primitive queries in the context of specified index
 186       *
 187       * @param Zend_Search_Lucene_Interface $index
 188       * @return Zend_Search_Lucene_Search_Query
 189       */
 190      public function rewrite(Zend_Search_Lucene_Interface $index)
 191      {
 192          if (count($this->_terms) == 0) {
 193              return new Zend_Search_Lucene_Search_Query_Empty();
 194          } else if ($this->_terms[0]->field !== null) {
 195              return $this;
 196          } else {
 197              $query = new Zend_Search_Lucene_Search_Query_Boolean();
 198              $query->setBoost($this->getBoost());
 199  
 200              foreach ($index->getFieldNames(true) as $fieldName) {
 201                  $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
 202                  $subquery->setSlop($this->getSlop());
 203  
 204                  foreach ($this->_terms as $termId => $term) {
 205                      $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);
 206  
 207                      $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
 208                  }
 209  
 210                  $query->addSubquery($subquery);
 211              }
 212  
 213              return $query;
 214          }
 215      }
 216  
 217      /**
 218       * Optimize query in the context of specified index
 219       *
 220       * @param Zend_Search_Lucene_Interface $index
 221       * @return Zend_Search_Lucene_Search_Query
 222       */
 223      public function optimize(Zend_Search_Lucene_Interface $index)
 224      {
 225          // Check, that index contains all phrase terms
 226          foreach ($this->_terms as $term) {
 227              if (!$index->hasTerm($term)) {
 228                  return new Zend_Search_Lucene_Search_Query_Empty();
 229              }
 230          }
 231  
 232          if (count($this->_terms) == 1) {
 233              // It's one term query
 234              $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
 235              $optimizedQuery->setBoost($this->getBoost());
 236  
 237              return $optimizedQuery;
 238          }
 239  
 240          if (count($this->_terms) == 0) {
 241              return new Zend_Search_Lucene_Search_Query_Empty();
 242          }
 243  
 244  
 245          return $this;
 246      }
 247  
 248      /**
 249       * Returns query term
 250       *
 251       * @return array
 252       */
 253      public function getTerms()
 254      {
 255          return $this->_terms;
 256      }
 257  
 258  
 259      /**
 260       * Set weight for specified term
 261       *
 262       * @param integer $num
 263       * @param Zend_Search_Lucene_Search_Weight_Term $weight
 264       */
 265      public function setWeight($num, $weight)
 266      {
 267          $this->_weights[$num] = $weight;
 268      }
 269  
 270  
 271      /**
 272       * Constructs an appropriate Weight implementation for this query.
 273       *
 274       * @param Zend_Search_Lucene_Interface $reader
 275       * @return Zend_Search_Lucene_Search_Weight
 276       */
 277      public function createWeight(Zend_Search_Lucene_Interface $reader)
 278      {
 279          $this->_weight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
 280          return $this->_weight;
 281      }
 282  
 283  
 284      /**
 285       * Score calculator for exact phrase queries (terms sequence is fixed)
 286       *
 287       * @param integer $docId
 288       * @return float
 289       */
 290      public function _exactPhraseFreq($docId)
 291      {
 292          $freq = 0;
 293  
 294          // Term Id with lowest cardinality
 295          $lowCardTermId = null;
 296  
 297          // Calculate $lowCardTermId
 298          foreach ($this->_terms as $termId => $term) {
 299              if ($lowCardTermId === null ||
 300                  count($this->_termsPositions[$termId][$docId]) <
 301                  count($this->_termsPositions[$lowCardTermId][$docId]) ) {
 302                      $lowCardTermId = $termId;
 303                  }
 304          }
 305  
 306          // Walk through positions of the term with lowest cardinality
 307          foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
 308              // We expect phrase to be found
 309              $freq++;
 310  
 311              // Walk through other terms
 312              foreach ($this->_terms as $termId => $term) {
 313                  if ($termId != $lowCardTermId) {
 314                      $expectedPosition = $lowCardPos +
 315                                              ($this->_offsets[$termId] -
 316                                               $this->_offsets[$lowCardTermId]);
 317  
 318                      if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
 319                          $freq--;  // Phrase wasn't found.
 320                          break;
 321                      }
 322                  }
 323              }
 324          }
 325  
 326          return $freq;
 327      }
 328  
 329      /**
 330       * Score calculator for sloppy phrase queries (terms sequence is fixed)
 331       *
 332       * @param integer $docId
 333       * @param Zend_Search_Lucene_Interface $reader
 334       * @return float
 335       */
 336      public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
 337      {
 338          $freq = 0;
 339  
 340          $phraseQueue = array();
 341          $phraseQueue[0] = array(); // empty phrase
 342          $lastTerm = null;
 343  
 344          // Walk through the terms to create phrases.
 345          foreach ($this->_terms as $termId => $term) {
 346              $queueSize = count($phraseQueue);
 347              $firstPass = true;
 348  
 349              // Walk through the term positions.
 350              // Each term position produces a set of phrases.
 351              foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) {
 352                  if ($firstPass) {
 353                      for ($count = 0; $count < $queueSize; $count++) {
 354                          $phraseQueue[$count][$termId] = $termPosition;
 355                      }
 356                  } else {
 357                      for ($count = 0; $count < $queueSize; $count++) {
 358                          if ($lastTerm !== null &&
 359                              abs( $termPosition - $phraseQueue[$count][$lastTerm] -
 360                                   ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
 361                              continue;
 362                          }
 363  
 364                          $newPhraseId = count($phraseQueue);
 365                          $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
 366                          $phraseQueue[$newPhraseId][$termId] = $termPosition;
 367                      }
 368  
 369                  }
 370  
 371                  $firstPass = false;
 372              }
 373              $lastTerm = $termId;
 374          }
 375  
 376  
 377          foreach ($phraseQueue as $phrasePos) {
 378              $minDistance = null;
 379  
 380              for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
 381                  $distance = 0;
 382                  $start = reset($phrasePos) - reset($this->_offsets) + $shift;
 383  
 384                  foreach ($this->_terms as $termId => $term) {
 385                      $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);
 386  
 387                      if($distance > $this->_slop) {
 388                          break;
 389                      }
 390                  }
 391  
 392                  if ($minDistance === null || $distance < $minDistance) {
 393                      $minDistance = $distance;
 394                  }
 395              }
 396  
 397              if ($minDistance <= $this->_slop) {
 398                  $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
 399              }
 400          }
 401  
 402          return $freq;
 403      }
 404  
 405      /**
 406       * Execute query in context of index reader
 407       * It also initializes necessary internal structures
 408       *
 409       * @param Zend_Search_Lucene_Interface $reader
 410       */
 411      public function execute(Zend_Search_Lucene_Interface $reader)
 412      {
 413          $this->_resVector = null;
 414  
 415          if (count($this->_terms) == 0) {
 416              $this->_resVector = array();
 417          }
 418  
 419          foreach( $this->_terms as $termId=>$term ) {
 420              if($this->_resVector === null) {
 421                  $this->_resVector = array_flip($reader->termDocs($term));
 422              } else {
 423                  $this->_resVector = array_intersect_key($this->_resVector, array_flip($reader->termDocs($term)));
 424              }
 425  
 426              if (count($this->_resVector) == 0) {
 427                  // Empty result set, we don't need to check other terms
 428                  break;
 429              }
 430  
 431              $this->_termsPositions[$termId] = $reader->termPositions($term);
 432          }
 433  
 434          ksort($this->_resVector, SORT_NUMERIC);
 435  
 436          // Initialize weight if it's not done yet
 437          $this->_initWeight($reader);
 438      }
 439  
 440      /**
 441       * Get document ids likely matching the query
 442       *
 443       * It's an array with document ids as keys (performance considerations)
 444       *
 445       * @return array
 446       */
 447      public function matchedDocs()
 448      {
 449          return $this->_resVector;
 450      }
 451  
 452      /**
 453       * Score specified document
 454       *
 455       * @param integer $docId
 456       * @param Zend_Search_Lucene_Interface $reader
 457       * @return float
 458       */
 459      public function score($docId, Zend_Search_Lucene_Interface $reader)
 460      {
 461          if (isset($this->_resVector[$docId])) {
 462              if ($this->_slop == 0) {
 463                  $freq = $this->_exactPhraseFreq($docId);
 464              } else {
 465                  $freq = $this->_sloppyPhraseFreq($docId, $reader);
 466              }
 467  
 468              if ($freq != 0) {
 469                  $tf = $reader->getSimilarity()->tf($freq);
 470                  $weight = $this->_weight->getValue();
 471                  $norm = $reader->norm($docId, reset($this->_terms)->field);
 472  
 473                  return $tf * $weight * $norm * $this->getBoost();
 474              }
 475  
 476              // Included in result, but culculated freq is zero
 477              return 0;
 478          } else {
 479              return 0;
 480          }
 481      }
 482  
 483      /**
 484       * Return query terms
 485       *
 486       * @return array
 487       */
 488      public function getQueryTerms()
 489      {
 490          return $this->_terms;
 491      }
 492  
 493      /**
 494       * Highlight query terms
 495       *
 496       * @param integer &$colorIndex
 497       * @param Zend_Search_Lucene_Document_Html $doc
 498       */
 499      public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 500      {
 501          $words = array();
 502          foreach ($this->_terms as $term) {
 503              $words[] = $term->text;
 504          }
 505  
 506          $doc->highlight($words, $this->_getHighlightColor($colorIndex));
 507      }
 508  
 509      /**
 510       * Print a query
 511       *
 512       * @return string
 513       */
 514      public function __toString()
 515      {
 516          // It's used only for query visualisation, so we don't care about characters escaping
 517  
 518          $query = '';
 519  
 520          if (isset($this->_terms[0]) && $this->_terms[0]->field !== null) {
 521              $query .= $this->_terms[0]->field . ':';
 522          }
 523  
 524          $query .= '"';
 525  
 526          foreach ($this->_terms as $id => $term) {
 527              if ($id != 0) {
 528                  $query .= ' ';
 529              }
 530              $query .= $term->text;
 531          }
 532  
 533          $query .= '"';
 534  
 535          if ($this->_slop != 0) {
 536              $query .= '~' . $this->_slop;
 537          }
 538  
 539          return $query;
 540      }
 541  }
 542  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7