Viewing file: SegmentWriter.php (19.97 KB) -rw-rw-rw- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php /** * Zend Framework * * LICENSE * * This source file is subject to the new BSD license that is bundled * with this package in the file LICENSE.txt. * It is also available through the world-wide-web at this URL: * http://framework.zend.com/license/new-bsd * If you did not receive a copy of the license and are unable to * obtain it through the world-wide-web, please send an email * to license@zend.com so we can send you a copy immediately. * * @category Zend * @package Zend_Search_Lucene * @subpackage Index * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License * @version $Id: SegmentWriter.php 18947 2009-11-12 11:57:17Z alexander $ */
/** Zend_Search_Lucene_Index_FieldInfo */ require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
/** Zend_Search_Lucene_Index_Term */ require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_TermInfo */ require_once 'Zend/Search/Lucene/Index/TermInfo.php';
/** * @category Zend * @package Zend_Search_Lucene * @subpackage Index * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com) * @license http://framework.zend.com/license/new-bsd New BSD License */ abstract class Zend_Search_Lucene_Index_SegmentWriter { /** * Expert: The fraction of terms in the "dictionary" which should be stored * in RAM. Smaller values use more memory, but make searching slightly * faster, while larger values use less memory and make searching slightly * slower. Searching is typically not dominated by dictionary lookup, so * tweaking this is rarely useful. * * @var integer */ public static $indexInterval = 128;
/** * Expert: The fraction of TermDocs entries stored in skip tables. * Larger values result in smaller indexes, greater acceleration, but fewer * accelerable cases, while smaller values result in bigger indexes, * less acceleration and more * accelerable cases. More detailed experiments would be useful here. * * 0x7FFFFFFF indicates that we don't use skip data * * Note: not used in current implementation * * @var integer */ public static $skipInterval = 0x7FFFFFFF;
/** * Expert: The maximum number of skip levels. Smaller values result in * slightly smaller indexes, but slower skipping in big posting lists. * * 0 indicates that we don't use skip data * * Note: not used in current implementation * * @var integer */ public static $maxSkipLevels = 0;
/** * Number of docs in a segment * * @var integer */ protected $_docCount = 0;
/** * Segment name * * @var string */ protected $_name;
/** * File system adapter. * * @var Zend_Search_Lucene_Storage_Directory */ protected $_directory;
/** * List of the index files. * Used for automatic compound file generation * * @var unknown_type */ protected $_files = array();
/** * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment * * @var array */ protected $_fields = array();
/** * Normalization factors. * An array fieldName => normVector * normVector is a binary string. * Each byte corresponds to an indexed document in a segment and * encodes normalization factor (float value, encoded by * Zend_Search_Lucene_Search_Similarity::encodeNorm()) * * @var array */ protected $_norms = array();
/** * '.fdx' file - Stored Fields, the field index. * * @var Zend_Search_Lucene_Storage_File */ protected $_fdxFile = null;
/** * '.fdt' file - Stored Fields, the field data. * * @var Zend_Search_Lucene_Storage_File */ protected $_fdtFile = null;
/** * Object constructor. * * @param Zend_Search_Lucene_Storage_Directory $directory * @param string $name */ public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name) { $this->_directory = $directory; $this->_name = $name; }
/** * Add field to the segment * * Returns actual field number * * @param Zend_Search_Lucene_Field $field * @return integer */ public function addField(Zend_Search_Lucene_Field $field) { if (!isset($this->_fields[$field->name])) { $fieldNumber = count($this->_fields); $this->_fields[$field->name] = new Zend_Search_Lucene_Index_FieldInfo($field->name, $field->isIndexed, $fieldNumber, $field->storeTermVector);
return $fieldNumber; } else { $this->_fields[$field->name]->isIndexed |= $field->isIndexed; $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
return $this->_fields[$field->name]->number; } }
/** * Add fieldInfo to the segment * * Returns actual field number * * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo * @return integer */ public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo) { if (!isset($this->_fields[$fieldInfo->name])) { $fieldNumber = count($this->_fields); $this->_fields[$fieldInfo->name] = new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name, $fieldInfo->isIndexed, $fieldNumber, $fieldInfo->storeTermVector);
return $fieldNumber; } else { $this->_fields[$fieldInfo->name]->isIndexed |= $fieldInfo->isIndexed; $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
return $this->_fields[$fieldInfo->name]->number; } }
/** * Returns array of FieldInfo objects. * * @return array */ public function getFieldInfos() { return $this->_fields; }
/** * Add stored fields information * * @param array $storedFields array of Zend_Search_Lucene_Field objects */ public function addStoredFields($storedFields) { if (!isset($this->_fdxFile)) { $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
$this->_files[] = $this->_name . '.fdx'; $this->_files[] = $this->_name . '.fdt'; }
$this->_fdxFile->writeLong($this->_fdtFile->tell()); $this->_fdtFile->writeVInt(count($storedFields)); foreach ($storedFields as $field) { $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); $fieldBits = ($field->isTokenized ? 0x01 : 0x00) | ($field->isBinary ? 0x02 : 0x00) | 0x00; /* 0x04 - third bit, compressed (ZLIB) */ $this->_fdtFile->writeByte($fieldBits); if ($field->isBinary) { $this->_fdtFile->writeVInt(strlen($field->value)); $this->_fdtFile->writeBytes($field->value); } else { $this->_fdtFile->writeString($field->getUtf8Value()); } }
$this->_docCount++; }
/** * Returns the total number of documents in this segment. * * @return integer */ public function count() { return $this->_docCount; }
/** * Return segment name * * @return string */ public function getName() { return $this->_name; }
/** * Dump Field Info (.fnm) segment file */ protected function _dumpFNM() { $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); $fnmFile->writeVInt(count($this->_fields));
$nrmFile = $this->_directory->createFile($this->_name . '.nrm'); // Write header $nrmFile->writeBytes('NRM'); // Write format specifier $nrmFile->writeByte((int)0xFF);
foreach ($this->_fields as $field) { $fnmFile->writeString($field->name); $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) | ($field->storeTermVector ? 0x02 : 0x00) // not supported yet 0x04 /* term positions are stored with the term vectors */ | // not supported yet 0x08 /* term offsets are stored with the term vectors */ | );
if ($field->isIndexed) { // pre-2.1 index mode (not used now) // $normFileName = $this->_name . '.f' . $field->number; // $fFile = $this->_directory->createFile($normFileName); // $fFile->writeBytes($this->_norms[$field->name]); // $this->_files[] = $normFileName;
$nrmFile->writeBytes($this->_norms[$field->name]); } }
$this->_files[] = $this->_name . '.fnm'; $this->_files[] = $this->_name . '.nrm'; }
/** * Term Dictionary file * * @var Zend_Search_Lucene_Storage_File */ private $_tisFile = null;
/** * Term Dictionary index file * * @var Zend_Search_Lucene_Storage_File */ private $_tiiFile = null;
/** * Frequencies file * * @var Zend_Search_Lucene_Storage_File */ private $_frqFile = null;
/** * Positions file * * @var Zend_Search_Lucene_Storage_File */ private $_prxFile = null;
/** * Number of written terms * * @var integer */ private $_termCount;
/** * Last saved term * * @var Zend_Search_Lucene_Index_Term */ private $_prevTerm;
/** * Last saved term info * * @var Zend_Search_Lucene_Index_TermInfo */ private $_prevTermInfo;
/** * Last saved index term * * @var Zend_Search_Lucene_Index_Term */ private $_prevIndexTerm;
/** * Last saved index term info * * @var Zend_Search_Lucene_Index_TermInfo */ private $_prevIndexTermInfo;
/** * Last term dictionary file position * * @var integer */ private $_lastIndexPosition;
/** * Create dicrionary, frequency and positions files and write necessary headers */ public function initializeDictionaryFiles() { $this->_tisFile = $this->_directory->createFile($this->_name . '.tis'); $this->_tisFile->writeInt((int)0xFFFFFFFD); $this->_tisFile->writeLong(0 /* dummy data for terms count */); $this->_tisFile->writeInt(self::$indexInterval); $this->_tisFile->writeInt(self::$skipInterval); $this->_tisFile->writeInt(self::$maxSkipLevels);
$this->_tiiFile = $this->_directory->createFile($this->_name . '.tii'); $this->_tiiFile->writeInt((int)0xFFFFFFFD); $this->_tiiFile->writeLong(0 /* dummy data for terms count */); $this->_tiiFile->writeInt(self::$indexInterval); $this->_tiiFile->writeInt(self::$skipInterval); $this->_tiiFile->writeInt(self::$maxSkipLevels);
/** Dump dictionary header */ $this->_tiiFile->writeVInt(0); // preffix length $this->_tiiFile->writeString(''); // suffix $this->_tiiFile->writeInt((int)0xFFFFFFFF); // field number $this->_tiiFile->writeByte((int)0x0F); $this->_tiiFile->writeVInt(0); // DocFreq $this->_tiiFile->writeVInt(0); // FreqDelta $this->_tiiFile->writeVInt(0); // ProxDelta $this->_tiiFile->writeVInt(24); // IndexDelta
$this->_frqFile = $this->_directory->createFile($this->_name . '.frq'); $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
$this->_files[] = $this->_name . '.tis'; $this->_files[] = $this->_name . '.tii'; $this->_files[] = $this->_name . '.frq'; $this->_files[] = $this->_name . '.prx';
$this->_prevTerm = null; $this->_prevTermInfo = null; $this->_prevIndexTerm = null; $this->_prevIndexTermInfo = null; $this->_lastIndexPosition = 24; $this->_termCount = 0;
}
/** * Add term * * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... ) * * @param Zend_Search_Lucene_Index_Term $termEntry * @param array $termDocs */ public function addTerm($termEntry, $termDocs) { $freqPointer = $this->_frqFile->tell(); $proxPointer = $this->_prxFile->tell();
$prevDoc = 0; foreach ($termDocs as $docId => $termPositions) { $docDelta = ($docId - $prevDoc)*2; $prevDoc = $docId; if (count($termPositions) > 1) { $this->_frqFile->writeVInt($docDelta); $this->_frqFile->writeVInt(count($termPositions)); } else { $this->_frqFile->writeVInt($docDelta + 1); }
$prevPosition = 0; foreach ($termPositions as $position) { $this->_prxFile->writeVInt($position - $prevPosition); $prevPosition = $position; } }
if (count($termDocs) >= self::$skipInterval) { /** * @todo Write Skip Data to a freq file. * It's not used now, but make index more optimal */ $skipOffset = $this->_frqFile->tell() - $freqPointer; } else { $skipOffset = 0; }
$term = new Zend_Search_Lucene_Index_Term($termEntry->text, $this->_fields[$termEntry->field]->number); $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs), $freqPointer, $proxPointer, $skipOffset);
$this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
if (($this->_termCount + 1) % self::$indexInterval == 0) { $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
$indexPosition = $this->_tisFile->tell(); $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition); $this->_lastIndexPosition = $indexPosition;
} $this->_termCount++; }
/** * Close dictionary */ public function closeDictionaryFiles() { $this->_tisFile->seek(4); $this->_tisFile->writeLong($this->_termCount);
$this->_tiiFile->seek(4); // + 1 is used to count an additional special index entry (empty term at the start of the list) $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1); }
/** * Dump Term Dictionary segment file entry. * Used to write entry to .tis or .tii files * * @param Zend_Search_Lucene_Storage_File $dicFile * @param Zend_Search_Lucene_Index_Term $prevTerm * @param Zend_Search_Lucene_Index_Term $term * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo * @param Zend_Search_Lucene_Index_TermInfo $termInfo */ protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, &$prevTerm, Zend_Search_Lucene_Index_Term $term, &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) { if (isset($prevTerm) && $prevTerm->field == $term->field) { $matchedBytes = 0; $maxBytes = min(strlen($prevTerm->text), strlen($term->text)); while ($matchedBytes < $maxBytes && $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) { $matchedBytes++; }
// Calculate actual matched UTF-8 pattern $prefixBytes = 0; $prefixChars = 0; while ($prefixBytes < $matchedBytes) { $charBytes = 1; if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) { $charBytes++; if (ord($term->text[$prefixBytes]) & 0x20 ) { $charBytes++; if (ord($term->text[$prefixBytes]) & 0x10 ) { $charBytes++; } } }
if ($prefixBytes + $charBytes > $matchedBytes) { // char crosses matched bytes boundary // skip char break; }
$prefixChars++; $prefixBytes += $charBytes; }
// Write preffix length $dicFile->writeVInt($prefixChars); // Write suffix $dicFile->writeString(substr($term->text, $prefixBytes)); } else { // Write preffix length $dicFile->writeVInt(0); // Write suffix $dicFile->writeString($term->text); } // Write field number $dicFile->writeVInt($term->field); // DocFreq (the count of documents which contain the term) $dicFile->writeVInt($termInfo->docFreq);
$prevTerm = $term;
if (!isset($prevTermInfo)) { // Write FreqDelta $dicFile->writeVInt($termInfo->freqPointer); // Write ProxDelta $dicFile->writeVInt($termInfo->proxPointer); } else { // Write FreqDelta $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); // Write ProxDelta $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); } // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval if ($termInfo->skipOffset != 0) { $dicFile->writeVInt($termInfo->skipOffset); }
$prevTermInfo = $termInfo; }
/** * Generate compound index file */ protected function _generateCFS() { $cfsFile = $this->_directory->createFile($this->_name . '.cfs'); $cfsFile->writeVInt(count($this->_files));
$dataOffsetPointers = array(); foreach ($this->_files as $fileName) { $dataOffsetPointers[$fileName] = $cfsFile->tell(); $cfsFile->writeLong(0); // write dummy data $cfsFile->writeString($fileName); }
foreach ($this->_files as $fileName) { // Get actual data offset $dataOffset = $cfsFile->tell(); // Seek to the data offset pointer $cfsFile->seek($dataOffsetPointers[$fileName]); // Write actual data offset value $cfsFile->writeLong($dataOffset); // Seek back to the end of file $cfsFile->seek($dataOffset);
$dataFile = $this->_directory->getFileObject($fileName);
$byteCount = $this->_directory->fileLength($fileName); while ($byteCount > 0) { $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/)); $byteCount -= strlen($data); $cfsFile->writeBytes($data); }
$this->_directory->deleteFile($fileName); } }
/** * Close segment, write it to disk and return segment info * * @return Zend_Search_Lucene_Index_SegmentInfo */ abstract public function close(); }
|