Viewing file: Similarity.php (24.2 KB) -rw-rw-rw- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php /** * Zend Framework * * LICENSE * * This source file is subject to the new BSD license that is bundled * with this package in the file LICENSE.txt. * It is also available through the world-wide-web at this URL: * http://framework.zend.com/license/new-bsd * If you did not receive a copy of the license and are unable to * obtain it through the world-wide-web, please send an email * to license@zend.com so we can send you a copy immediately. * * @category Zend * @package Zend_Search_Lucene * @subpackage Search * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License * @version $Id: Similarity.php 18954 2009-11-12 20:01:33Z alexander $ */
/**
 * Base class for scoring strategies: defines the abstract scoring factors
 * (tf, idf, coord, norms) and provides the shared one-byte norm codec.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Search_Similarity
{
    /**
     * The Similarity implementation used by default.
     *
     * Populated by setDefault(), or lazily by getDefault() on first use.
     *
     * @var Zend_Search_Lucene_Search_Similarity
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes.
     *
     * Maps every possible norm byte (0..255) to the float it stands for.
     * Entry 0 is exactly zero and the values increase with the index, which
     * is what allows _floatToByte() to binary-search this table.
     *
     * @var array
     */
    private static $_normTable = array(
        0   => 0.0,           1   => 5.820766E-10,  2   => 6.9849193E-10, 3   => 8.1490725E-10,
        4   => 9.313226E-10,  5   => 1.1641532E-9,  6   => 1.3969839E-9,  7   => 1.6298145E-9,
        8   => 1.8626451E-9,  9   => 2.3283064E-9,  10  => 2.7939677E-9,  11  => 3.259629E-9,
        12  => 3.7252903E-9,  13  => 4.656613E-9,   14  => 5.5879354E-9,  15  => 6.519258E-9,
        16  => 7.4505806E-9,  17  => 9.313226E-9,   18  => 1.1175871E-8,  19  => 1.3038516E-8,
        20  => 1.4901161E-8,  21  => 1.8626451E-8,  22  => 2.2351742E-8,  23  => 2.6077032E-8,
        24  => 2.9802322E-8,  25  => 3.7252903E-8,  26  => 4.4703484E-8,  27  => 5.2154064E-8,
        28  => 5.9604645E-8,  29  => 7.4505806E-8,  30  => 8.940697E-8,   31  => 1.0430813E-7,
        32  => 1.1920929E-7,  33  => 1.4901161E-7,  34  => 1.7881393E-7,  35  => 2.0861626E-7,
        36  => 2.3841858E-7,  37  => 2.9802322E-7,  38  => 3.5762787E-7,  39  => 4.172325E-7,
        40  => 4.7683716E-7,  41  => 5.9604645E-7,  42  => 7.1525574E-7,  43  => 8.34465E-7,
        44  => 9.536743E-7,   45  => 1.1920929E-6,  46  => 1.4305115E-6,  47  => 1.66893E-6,
        48  => 1.9073486E-6,  49  => 2.3841858E-6,  50  => 2.861023E-6,   51  => 3.33786E-6,
        52  => 3.8146973E-6,  53  => 4.7683716E-6,  54  => 5.722046E-6,   55  => 6.67572E-6,
        56  => 7.6293945E-6,  57  => 9.536743E-6,   58  => 1.1444092E-5,  59  => 1.335144E-5,
        60  => 1.5258789E-5,  61  => 1.9073486E-5,  62  => 2.2888184E-5,  63  => 2.670288E-5,
        64  => 3.0517578E-5,  65  => 3.8146973E-5,  66  => 4.5776367E-5,  67  => 5.340576E-5,
        68  => 6.1035156E-5,  69  => 7.6293945E-5,  70  => 9.1552734E-5,  71  => 1.0681152E-4,
        72  => 1.2207031E-4,  73  => 1.5258789E-4,  74  => 1.8310547E-4,  75  => 2.1362305E-4,
        76  => 2.4414062E-4,  77  => 3.0517578E-4,  78  => 3.6621094E-4,  79  => 4.272461E-4,
        80  => 4.8828125E-4,  81  => 6.1035156E-4,  82  => 7.324219E-4,   83  => 8.544922E-4,
        84  => 9.765625E-4,   85  => 0.0012207031,  86  => 0.0014648438,  87  => 0.0017089844,
        88  => 0.001953125,   89  => 0.0024414062,  90  => 0.0029296875,  91  => 0.0034179688,
        92  => 0.00390625,    93  => 0.0048828125,  94  => 0.005859375,   95  => 0.0068359375,
        96  => 0.0078125,     97  => 0.009765625,   98  => 0.01171875,    99  => 0.013671875,
        100 => 0.015625,      101 => 0.01953125,    102 => 0.0234375,     103 => 0.02734375,
        104 => 0.03125,       105 => 0.0390625,     106 => 0.046875,      107 => 0.0546875,
        108 => 0.0625,        109 => 0.078125,      110 => 0.09375,       111 => 0.109375,
        112 => 0.125,         113 => 0.15625,       114 => 0.1875,        115 => 0.21875,
        116 => 0.25,          117 => 0.3125,        118 => 0.375,         119 => 0.4375,
        120 => 0.5,           121 => 0.625,         122 => 0.75,          123 => 0.875,
        124 => 1.0,           125 => 1.25,          126 => 1.5,           127 => 1.75,
        128 => 2.0,           129 => 2.5,           130 => 3.0,           131 => 3.5,
        132 => 4.0,           133 => 5.0,           134 => 6.0,           135 => 7.0,
        136 => 8.0,           137 => 10.0,          138 => 12.0,          139 => 14.0,
        140 => 16.0,          141 => 20.0,          142 => 24.0,          143 => 28.0,
        144 => 32.0,          145 => 40.0,          146 => 48.0,          147 => 56.0,
        148 => 64.0,          149 => 80.0,          150 => 96.0,          151 => 112.0,
        152 => 128.0,         153 => 160.0,         154 => 192.0,         155 => 224.0,
        156 => 256.0,         157 => 320.0,         158 => 384.0,         159 => 448.0,
        160 => 512.0,         161 => 640.0,         162 => 768.0,         163 => 896.0,
        164 => 1024.0,        165 => 1280.0,        166 => 1536.0,        167 => 1792.0,
        168 => 2048.0,        169 => 2560.0,        170 => 3072.0,        171 => 3584.0,
        172 => 4096.0,        173 => 5120.0,        174 => 6144.0,        175 => 7168.0,
        176 => 8192.0,        177 => 10240.0,       178 => 12288.0,       179 => 14336.0,
        180 => 16384.0,       181 => 20480.0,       182 => 24576.0,       183 => 28672.0,
        184 => 32768.0,       185 => 40960.0,       186 => 49152.0,       187 => 57344.0,
        188 => 65536.0,       189 => 81920.0,       190 => 98304.0,       191 => 114688.0,
        192 => 131072.0,      193 => 163840.0,      194 => 196608.0,      195 => 229376.0,
        196 => 262144.0,      197 => 327680.0,      198 => 393216.0,      199 => 458752.0,
        200 => 524288.0,      201 => 655360.0,      202 => 786432.0,      203 => 917504.0,
        204 => 1048576.0,     205 => 1310720.0,     206 => 1572864.0,     207 => 1835008.0,
        208 => 2097152.0,     209 => 2621440.0,     210 => 3145728.0,     211 => 3670016.0,
        212 => 4194304.0,     213 => 5242880.0,     214 => 6291456.0,     215 => 7340032.0,
        216 => 8388608.0,     217 => 1.048576E7,    218 => 1.2582912E7,   219 => 1.4680064E7,
        220 => 1.6777216E7,   221 => 2.097152E7,    222 => 2.5165824E7,   223 => 2.9360128E7,
        224 => 3.3554432E7,   225 => 4.194304E7,    226 => 5.0331648E7,   227 => 5.8720256E7,
        228 => 6.7108864E7,   229 => 8.388608E7,    230 => 1.00663296E8,  231 => 1.17440512E8,
        232 => 1.34217728E8,  233 => 1.6777216E8,   234 => 2.01326592E8,  235 => 2.34881024E8,
        236 => 2.68435456E8,  237 => 3.3554432E8,   238 => 4.02653184E8,  239 => 4.69762048E8,
        240 => 5.3687091E8,   241 => 6.7108864E8,   242 => 8.0530637E8,   243 => 9.395241E8,
        244 => 1.07374182E9,  245 => 1.34217728E9,  246 => 1.61061274E9,  247 => 1.87904819E9,
        248 => 2.14748365E9,  249 => 2.68435456E9,  250 => 3.22122547E9,  251 => 3.75809638E9,
        252 => 4.2949673E9,   253 => 5.3687091E9,   254 => 6.4424509E9,   255 => 7.5161928E9
    );

    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     */
    public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }

    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * When no implementation has been registered through setDefault(), a
     * Zend_Search_Lucene_Search_Similarity_Default instance is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
            // Loaded on demand so the default class is only pulled in when needed.
            require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
        }

        return self::$_defaultImpl;
    }

    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field. These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by
     * the search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed when a document is added to the
     * index and are then stored using encodeNorm(). Thus they have limited
     * precision, and documents must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms. This value is then multiplied into
     * the weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);

    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low eight bits of $byte are used, so any integer maps onto one
     * of the 256 table entries.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }

    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from about 5.8x10^-10 up to about 7.5x10^9 with
     * about one significant decimal digit of accuracy. Zero is also
     * represented. Negative numbers are rounded up to zero. Values too large
     * to represent are rounded down to the largest representable value.
     * Positive values too small to represent are rounded up to the smallest
     * positive representable value. Everything in between is mapped to the
     * closest representable value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion.
     *
     * Finds the index (0..255) of the norm table entry that best represents
     * $f, using a binary search followed by round-to-closest.
     *
     * @param float $f
     * @return integer
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // search for appropriate value
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex = $mid + 1;
            } else {
                return $mid; // exact table value
            }
        }

        // No exact hit: $highIndex now points at the largest entry below $f.

        // $f is positive but smaller than the smallest positive entry. It must
        // not collapse to the zero code, so round it up to entry 1.
        if ($highIndex == 0) {
            return 1;
        }

        // round to closest value (ties go to the lower entry); values above
        // the last entry stay clamped at 255
        if ($highIndex != 255
            && $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f
        ) {
            return $highIndex + 1;
        }

        return $highIndex;
    }

    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document. This value is multiplied by the idf() factor for each term in
     * the query and these products are then summed to form the initial score
     * for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf().
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);

    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term this is
     *   idfFreq($reader->docFreq($term), $reader->count())
     * and for an array of terms it is the sum of that value over all terms
     * (0.0 for an empty array).
     *
     * input  - the term in question or array of terms
     * reader - reader the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function idf($input, Zend_Search_Lucene_Interface $reader)
    {
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        }

        // The collection size is the same for every term, so ask for it once.
        $numDocs = $reader->count();

        $idf = 0.0;
        foreach ($input as $term) {
            $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
        }

        return $idf;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term). This value is multiplied by the
     * tf() factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare
     * terms, and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains. This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and
     * smaller values when the ratio between them is small.
     *
     * overlap    - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}
|