[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/search/Zend/Search/Lucene/Search/ -> Similarity.php (source)

   1  <?php
   2  /**
   3   * Zend Framework
   4   *
   5   * LICENSE
   6   *
   7   * This source file is subject to the new BSD license that is bundled
   8   * with this package in the file LICENSE.txt.
   9   * It is also available through the world-wide-web at this URL:
  10   * http://framework.zend.com/license/new-bsd
  11   * If you did not receive a copy of the license and are unable to
  12   * obtain it through the world-wide-web, please send an email
  13   * to license@zend.com so we can send you a copy immediately.
  14   *
  15   * @category   Zend
  16   * @package    Zend_Search_Lucene
  17   * @subpackage Search
  18   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20   */
  21  
  22  
  23  /** Zend_Search_Lucene_Search_Similarity_Default */
  24  require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Search/Similarity/Default.php';
  25  
  26  
  27  /**
  28   * @category   Zend
  29   * @package    Zend_Search_Lucene
  30   * @subpackage Search
  31   * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  32   * @license    http://framework.zend.com/license/new-bsd     New BSD License
  33   */
  34  abstract class Zend_Search_Lucene_Search_Similarity
  35  {
  36      /**
  37       * The Similarity implementation used by default.
  38       *
  39       * @var Zend_Search_Lucene_Search_Similarity
  40       */
  41      private static $_defaultImpl;
  42  
  43      /**
  44       * Cache of decoded bytes.
  45       * Array of floats
  46       *
  47       * @var array
  48       */
  49      private static $_normTable = array( 0   => 0.0,
  50                                          1   => 5.820766E-10,
  51                                          2   => 6.9849193E-10,
  52                                          3   => 8.1490725E-10,
  53                                          4   => 9.313226E-10,
  54                                          5   => 1.1641532E-9,
  55                                          6   => 1.3969839E-9,
  56                                          7   => 1.6298145E-9,
  57                                          8   => 1.8626451E-9,
  58                                          9   => 2.3283064E-9,
  59                                          10  => 2.7939677E-9,
  60                                          11  => 3.259629E-9,
  61                                          12  => 3.7252903E-9,
  62                                          13  => 4.656613E-9,
  63                                          14  => 5.5879354E-9,
  64                                          15  => 6.519258E-9,
  65                                          16  => 7.4505806E-9,
  66                                          17  => 9.313226E-9,
  67                                          18  => 1.1175871E-8,
  68                                          19  => 1.3038516E-8,
  69                                          20  => 1.4901161E-8,
  70                                          21  => 1.8626451E-8,
  71                                          22  => 2.2351742E-8,
  72                                          23  => 2.6077032E-8,
  73                                          24  => 2.9802322E-8,
  74                                          25  => 3.7252903E-8,
  75                                          26  => 4.4703484E-8,
  76                                          27  => 5.2154064E-8,
  77                                          28  => 5.9604645E-8,
  78                                          29  => 7.4505806E-8,
  79                                          30  => 8.940697E-8,
  80                                          31  => 1.0430813E-7,
  81                                          32  => 1.1920929E-7,
  82                                          33  => 1.4901161E-7,
  83                                          34  => 1.7881393E-7,
  84                                          35  => 2.0861626E-7,
  85                                          36  => 2.3841858E-7,
  86                                          37  => 2.9802322E-7,
  87                                          38  => 3.5762787E-7,
  88                                          39  => 4.172325E-7,
  89                                          40  => 4.7683716E-7,
  90                                          41  => 5.9604645E-7,
  91                                          42  => 7.1525574E-7,
  92                                          43  => 8.34465E-7,
  93                                          44  => 9.536743E-7,
  94                                          45  => 1.1920929E-6,
  95                                          46  => 1.4305115E-6,
  96                                          47  => 1.66893E-6,
  97                                          48  => 1.9073486E-6,
  98                                          49  => 2.3841858E-6,
  99                                          50  => 2.861023E-6,
 100                                          51  => 3.33786E-6,
 101                                          52  => 3.8146973E-6,
 102                                          53  => 4.7683716E-6,
 103                                          54  => 5.722046E-6,
 104                                          55  => 6.67572E-6,
 105                                          56  => 7.6293945E-6,
 106                                          57  => 9.536743E-6,
 107                                          58  => 1.1444092E-5,
 108                                          59  => 1.335144E-5,
 109                                          60  => 1.5258789E-5,
 110                                          61  => 1.9073486E-5,
 111                                          62  => 2.2888184E-5,
 112                                          63  => 2.670288E-5,
 113                                          64  => 3.0517578E-5,
 114                                          65  => 3.8146973E-5,
 115                                          66  => 4.5776367E-5,
 116                                          67  => 5.340576E-5,
 117                                          68  => 6.1035156E-5,
 118                                          69  => 7.6293945E-5,
 119                                          70  => 9.1552734E-5,
 120                                          71  => 1.0681152E-4,
 121                                          72  => 1.2207031E-4,
 122                                          73  => 1.5258789E-4,
 123                                          74  => 1.8310547E-4,
 124                                          75  => 2.1362305E-4,
 125                                          76  => 2.4414062E-4,
 126                                          77  => 3.0517578E-4,
 127                                          78  => 3.6621094E-4,
 128                                          79  => 4.272461E-4,
 129                                          80  => 4.8828125E-4,
 130                                          81  => 6.1035156E-4,
 131                                          82  => 7.324219E-4,
 132                                          83  => 8.544922E-4,
 133                                          84  => 9.765625E-4,
 134                                          85  => 0.0012207031,
 135                                          86  => 0.0014648438,
 136                                          87  => 0.0017089844,
 137                                          88  => 0.001953125,
 138                                          89  => 0.0024414062,
 139                                          90  => 0.0029296875,
 140                                          91  => 0.0034179688,
 141                                          92  => 0.00390625,
 142                                          93  => 0.0048828125,
 143                                          94  => 0.005859375,
 144                                          95  => 0.0068359375,
 145                                          96  => 0.0078125,
 146                                          97  => 0.009765625,
 147                                          98  => 0.01171875,
 148                                          99  => 0.013671875,
 149                                          100 => 0.015625,
 150                                          101 => 0.01953125,
 151                                          102 => 0.0234375,
 152                                          103 => 0.02734375,
 153                                          104 => 0.03125,
 154                                          105 => 0.0390625,
 155                                          106 => 0.046875,
 156                                          107 => 0.0546875,
 157                                          108 => 0.0625,
 158                                          109 => 0.078125,
 159                                          110 => 0.09375,
 160                                          111 => 0.109375,
 161                                          112 => 0.125,
 162                                          113 => 0.15625,
 163                                          114 => 0.1875,
 164                                          115 => 0.21875,
 165                                          116 => 0.25,
 166                                          117 => 0.3125,
 167                                          118 => 0.375,
 168                                          119 => 0.4375,
 169                                          120 => 0.5,
 170                                          121 => 0.625,
 171                                          122 => 0.75,
 172                                          123 => 0.875,
 173                                          124 => 1.0,
 174                                          125 => 1.25,
 175                                          126 => 1.5,
 176                                          127 => 1.75,
 177                                          128 => 2.0,
 178                                          129 => 2.5,
 179                                          130 => 3.0,
 180                                          131 => 3.5,
 181                                          132 => 4.0,
 182                                          133 => 5.0,
 183                                          134 => 6.0,
 184                                          135 => 7.0,
 185                                          136 => 8.0,
 186                                          137 => 10.0,
 187                                          138 => 12.0,
 188                                          139 => 14.0,
 189                                          140 => 16.0,
 190                                          141 => 20.0,
 191                                          142 => 24.0,
 192                                          143 => 28.0,
 193                                          144 => 32.0,
 194                                          145 => 40.0,
 195                                          146 => 48.0,
 196                                          147 => 56.0,
 197                                          148 => 64.0,
 198                                          149 => 80.0,
 199                                          150 => 96.0,
 200                                          151 => 112.0,
 201                                          152 => 128.0,
 202                                          153 => 160.0,
 203                                          154 => 192.0,
 204                                          155 => 224.0,
 205                                          156 => 256.0,
 206                                          157 => 320.0,
 207                                          158 => 384.0,
 208                                          159 => 448.0,
 209                                          160 => 512.0,
 210                                          161 => 640.0,
 211                                          162 => 768.0,
 212                                          163 => 896.0,
 213                                          164 => 1024.0,
 214                                          165 => 1280.0,
 215                                          166 => 1536.0,
 216                                          167 => 1792.0,
 217                                          168 => 2048.0,
 218                                          169 => 2560.0,
 219                                          170 => 3072.0,
 220                                          171 => 3584.0,
 221                                          172 => 4096.0,
 222                                          173 => 5120.0,
 223                                          174 => 6144.0,
 224                                          175 => 7168.0,
 225                                          176 => 8192.0,
 226                                          177 => 10240.0,
 227                                          178 => 12288.0,
 228                                          179 => 14336.0,
 229                                          180 => 16384.0,
 230                                          181 => 20480.0,
 231                                          182 => 24576.0,
 232                                          183 => 28672.0,
 233                                          184 => 32768.0,
 234                                          185 => 40960.0,
 235                                          186 => 49152.0,
 236                                          187 => 57344.0,
 237                                          188 => 65536.0,
 238                                          189 => 81920.0,
 239                                          190 => 98304.0,
 240                                          191 => 114688.0,
 241                                          192 => 131072.0,
 242                                          193 => 163840.0,
 243                                          194 => 196608.0,
 244                                          195 => 229376.0,
 245                                          196 => 262144.0,
 246                                          197 => 327680.0,
 247                                          198 => 393216.0,
 248                                          199 => 458752.0,
 249                                          200 => 524288.0,
 250                                          201 => 655360.0,
 251                                          202 => 786432.0,
 252                                          203 => 917504.0,
 253                                          204 => 1048576.0,
 254                                          205 => 1310720.0,
 255                                          206 => 1572864.0,
 256                                          207 => 1835008.0,
 257                                          208 => 2097152.0,
 258                                          209 => 2621440.0,
 259                                          210 => 3145728.0,
 260                                          211 => 3670016.0,
 261                                          212 => 4194304.0,
 262                                          213 => 5242880.0,
 263                                          214 => 6291456.0,
 264                                          215 => 7340032.0,
 265                                          216 => 8388608.0,
 266                                          217 => 1.048576E7,
 267                                          218 => 1.2582912E7,
 268                                          219 => 1.4680064E7,
 269                                          220 => 1.6777216E7,
 270                                          221 => 2.097152E7,
 271                                          222 => 2.5165824E7,
 272                                          223 => 2.9360128E7,
 273                                          224 => 3.3554432E7,
 274                                          225 => 4.194304E7,
 275                                          226 => 5.0331648E7,
 276                                          227 => 5.8720256E7,
 277                                          228 => 6.7108864E7,
 278                                          229 => 8.388608E7,
 279                                          230 => 1.00663296E8,
 280                                          231 => 1.17440512E8,
 281                                          232 => 1.34217728E8,
 282                                          233 => 1.6777216E8,
 283                                          234 => 2.01326592E8,
 284                                          235 => 2.34881024E8,
 285                                          236 => 2.68435456E8,
 286                                          237 => 3.3554432E8,
 287                                          238 => 4.02653184E8,
 288                                          239 => 4.69762048E8,
 289                                          240 => 5.3687091E8,
 290                                          241 => 6.7108864E8,
 291                                          242 => 8.0530637E8,
 292                                          243 => 9.395241E8,
 293                                          244 => 1.07374182E9,
 294                                          245 => 1.34217728E9,
 295                                          246 => 1.61061274E9,
 296                                          247 => 1.87904819E9,
 297                                          248 => 2.14748365E9,
 298                                          249 => 2.68435456E9,
 299                                          250 => 3.22122547E9,
 300                                          251 => 3.75809638E9,
 301                                          252 => 4.2949673E9,
 302                                          253 => 5.3687091E9,
 303                                          254 => 6.4424509E9,
 304                                          255 => 7.5161928E9 );
 305  
 306  
 307      /**
 308       * Set the default Similarity implementation used by indexing and search
 309       * code.
 310       *
 311       * @param Zend_Search_Lucene_Search_Similarity $similarity
 312       */
 313      public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
 314      {
 315          self::$_defaultImpl = $similarity;
 316      }
 317  
 318  
 319      /**
 320       * Return the default Similarity implementation used by indexing and search
 321       * code.
 322       *
 323       * @return Zend_Search_Lucene_Search_Similarity
 324       */
 325      public static function getDefault()
 326      {
 327          if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
 328              self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
 329          }
 330  
 331          return self::$_defaultImpl;
 332      }
 333  
 334  
 335      /**
 336       * Computes the normalization value for a field given the total number of
 337       * terms contained in a field.  These values, together with field boosts, are
 338       * stored in an index and multipled into scores for hits on each field by the
 339       * search code.
 340       *
 341       * Matches in longer fields are less precise, so implemenations of this
 342       * method usually return smaller values when 'numTokens' is large,
 343       * and larger values when 'numTokens' is small.
 344       *
 345       * That these values are computed under
 346       * IndexWriter::addDocument(Document) and stored then using
 347       * encodeNorm(float).  Thus they have limited precision, and documents
 348       * must be re-indexed if this method is altered.
 349       *
 350       * fieldName - name of field
 351       * numTokens - the total number of tokens contained in fields named
 352       *             'fieldName' of 'doc'.
 353       * Returns a normalization factor for hits on this field of this document
 354       *
 355       * @param string $fieldName
 356       * @param integer $numTokens
 357       * @return float
 358       */
 359      abstract public function lengthNorm($fieldName, $numTokens);
 360  
 361      /**
 362       * Computes the normalization value for a query given the sum of the squared
 363       * weights of each of the query terms.  This value is then multipled into the
 364       * weight of each query term.
 365       *
 366       * This does not affect ranking, but rather just attempts to make scores
 367       * from different queries comparable.
 368       *
 369       * sumOfSquaredWeights - the sum of the squares of query term weights
 370       * Returns a normalization factor for query weights
 371       *
 372       * @param float $sumOfSquaredWeights
 373       * @return float
 374       */
 375      abstract public function queryNorm($sumOfSquaredWeights);
 376  
 377  
 378      /**
 379       *  Decodes a normalization factor stored in an index.
 380       *
 381       * @param integer $byte
 382       * @return float
 383       */
 384      public static function decodeNorm($byte)
 385      {
 386          return self::$_normTable[$byte & 0xFF];
 387      }
 388  
 389  
 390      /**
 391       * Encodes a normalization factor for storage in an index.
 392       *
 393       * The encoding uses a five-bit exponent and three-bit mantissa, thus
 394       * representing values from around 7x10^9 to 2x10^-9 with about one
 395       * significant decimal digit of accuracy.  Zero is also represented.
 396       * Negative numbers are rounded up to zero.  Values too large to represent
 397       * are rounded down to the largest representable value.  Positive values too
 398       * small to represent are rounded up to the smallest positive representable
 399       * value.
 400       *
 401       * @param float $f
 402       * @return integer
 403       */
 404      static function encodeNorm($f)
 405      {
 406        return self::_floatToByte($f);
 407      }
 408  
 409      /**
 410       * Float to byte conversion
 411       *
 412       * @param integer $b
 413       * @return float
 414       */
 415      private static function _floatToByte($f)
 416      {
 417          // round negatives up to zero
 418          if ($f <= 0.0) {
 419              return 0;
 420          }
 421  
 422          // search for appropriate value
 423          $lowIndex = 0;
 424          $highIndex = 255;
 425          while ($highIndex >= $lowIndex) {
 426              // $mid = ($highIndex - $lowIndex)/2;
 427              $mid = ($highIndex + $lowIndex) >> 1;
 428              $delta = $f - self::$_normTable[$mid];
 429  
 430              if ($delta < 0) {
 431                  $highIndex = $mid-1;
 432              } elseif ($delta > 0) {
 433                  $lowIndex  = $mid+1;
 434              } else {
 435                  return $mid; // We got it!
 436              }
 437          }
 438  
 439          // round to closest value
 440          if ($highIndex != 255 &&
 441              $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
 442              return $highIndex + 1;
 443          } else {
 444              return $highIndex;
 445          }
 446      }
 447  
 448  
 449      /**
 450       * Computes a score factor based on a term or phrase's frequency in a
 451       * document.  This value is multiplied by the idf(Term, Searcher)
 452       * factor for each term in the query and these products are then summed to
 453       * form the initial score for a document.
 454       *
 455       * Terms and phrases repeated in a document indicate the topic of the
 456       * document, so implementations of this method usually return larger values
 457       * when 'freq' is large, and smaller values when 'freq'
 458       * is small.
 459       *
 460       * freq - the frequency of a term within a document
 461       * Returns a score factor based on a term's within-document frequency
 462       *
 463       * @param float $freq
 464       * @return float
 465       */
 466      abstract public function tf($freq);
 467  
 468      /**
 469       * Computes the amount of a sloppy phrase match, based on an edit distance.
 470       * This value is summed for each sloppy phrase match in a document to form
 471       * the frequency that is passed to tf(float).
 472       *
 473       * A phrase match with a small edit distance to a document passage more
 474       * closely matches the document, so implementations of this method usually
 475       * return larger values when the edit distance is small and smaller values
 476       * when it is large.
 477       *
 478       * distance - the edit distance of this sloppy phrase match
 479       * Returns the frequency increment for this match
 480       *
 481       * @param integer $distance
 482       * @return float
 483       */
 484      abstract public function sloppyFreq($distance);
 485  
 486  
 487      /**
 488       * Computes a score factor for a simple term or a phrase.
 489       *
 490       * The default implementation is:
 491       *   return idfFreq(searcher.docFreq(term), searcher.maxDoc());
 492       *
 493       * input - the term in question or array of terms
 494       * reader - reader the document collection being searched
 495       * Returns a score factor for the term
 496       *
 497       * @param mixed $input
 498       * @param Zend_Search_Lucene_Interface $reader
 499       * @return a score factor for the term
 500       */
 501      public function idf($input, Zend_Search_Lucene_Interface $reader)
 502      {
 503          if (!is_array($input)) {
 504              return $this->idfFreq($reader->docFreq($input), $reader->count());
 505          } else {
 506              $idf = 0.0;
 507              foreach ($input as $term) {
 508                  $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
 509              }
 510              return $idf;
 511          }
 512      }
 513  
 514      /**
 515       * Computes a score factor based on a term's document frequency (the number
 516       * of documents which contain the term).  This value is multiplied by the
 517       * tf(int) factor for each term in the query and these products are
 518       * then summed to form the initial score for a document.
 519       *
 520       * Terms that occur in fewer documents are better indicators of topic, so
 521       * implemenations of this method usually return larger values for rare terms,
 522       * and smaller values for common terms.
 523       *
 524       * docFreq - the number of documents which contain the term
 525       * numDocs - the total number of documents in the collection
 526       * Returns a score factor based on the term's document frequency
 527       *
 528       * @param integer $docFreq
 529       * @param integer $numDocs
 530       * @return float
 531       */
 532      abstract public function idfFreq($docFreq, $numDocs);
 533  
 534      /**
 535       * Computes a score factor based on the fraction of all query terms that a
 536       * document contains.  This value is multiplied into scores.
 537       *
 538       * The presence of a large portion of the query terms indicates a better
 539       * match with the query, so implemenations of this method usually return
 540       * larger values when the ratio between these parameters is large and smaller
 541       * values when the ratio between them is small.
 542       *
 543       * overlap - the number of query terms matched in the document
 544       * maxOverlap - the total number of terms in the query
 545       * Returns a score factor based on term overlap with the query
 546       *
 547       * @param integer $overlap
 548       * @param integer $maxOverlap
 549       * @return float
 550       */
 551      abstract public function coord($overlap, $maxOverlap);
 552  }
 553  


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7