<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\library as L;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\IndexShard;

/**
 * Used to iterate through all the documents and links associated with
 * an IndexArchiveBundle. It iterates through each doc or link regardless of
 * the words it contains. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @see IndexArchiveBundle
 */
class DocIterator extends IndexBundleIterator
{
    /**
     * When results are accessed from $index_name, the order in which they
     * should be presented. self::ASCENDING is from first added to last
     * added, self::DESCENDING is from last added to first added.
     * @var int
     */
    public $direction;
    /**
     * Doc map most recently loaded by getGenerationInfo() for index
     * versions >= 3; its keys are doc keys and its values are the packed
     * entries that findDocsWithWord() unpacks
     * @var array
     */
    public $doc_map;
    /**
     * Index of the current generation/partition in the doc_map to get results
     * from
     * @var int
     */
    public $doc_map_generation;
    /**
     * The timestamp of the index that is associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * List of the keys of $doc_map which are not of type "link", in
     * doc map order. Offsets used by this iterator for index versions >= 3
     * are positions in this list
     * @var array
     */
    public $key_index;
    /**
     * The index version affects how the iterator cycles through documents
     * There was a big change in index format between version 3 and prior
     * formats
     * @var int
     */
    public $index_version;
    /**
     * The next byte offset of a doc in the IndexShard
     * @var int
     */
    public $next_offset;
    /**
     * Last offset of a doc occurrence in the IndexShard
     * @var int
     */
    public $last_offset;
    /**
     * The current byte offset in the IndexShard
     * @var int
     */
    public $current_offset;
    /**
     * How url, keywords, and title words should influence relevance
     * and doc rank calculations
     * @var array
     */
    public $ranking_factors;
    /**
     * An array of shard docids_lens
     * @var array
     */
    public $shard_lens;
    /**
     * The total number of generations (shards or partitions) of the index
     * that this iterator cycles through
     * @var int
     */
    public $num_generations;
    /**
     * Numeric number of current shard
     * @var int
     */
    public $current_generation;
    /**
     * Model responsible for keeping track of edited and deleted search results
     * @var SearchfiltersModel
     */
    public $filter;
    /**
     * Host Key position + 1 (first char says doc, inlink or external link)
     */
    const HOST_KEY_POS = 17;
    /**
     * Length of a doc key
     */
    const KEY_LEN = 8;
    /**
     * Creates a doc iterator with the given parameters.
     *
     * @param string $index_name time_stamp of the index to use
     * @param SearchfiltersModel $filter Model responsible for keeping
     *  track of edited and deleted search results
     * @param int $results_per_block number of results in a block of results
     *  returned in one go from the iterator
     * @param int $direction when results are accessed from $index_name, the
     *  order in which they should be presented. self::ASCENDING is from first
     *  added to last added, self::DESCENDING is from last added to first
     *  added. Note: this value is not saved permanently. So you
     *  could in theory open two read only versions of the same bundle but
     *  reading the results in different directions
     * @param array $ranking_factors field says url being a host, cld,
     *  or having a lot of slashes should affect its doc rank calculations
     */
    public function __construct($index_name, $filter = null,
        $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
        $direction = self::ASCENDING, $ranking_factors = [])
    {
        $this->filter = $filter;
        $this->index_name = $index_name;
        $this->direction = $direction;
        $this->ranking_factors = $ranking_factors;
        $this->results_per_block = $results_per_block;
        $this->current_block_fresh = false;
        $this->index_version = IndexManager::getVersion($index_name);
        $index = IndexManager::getIndex($index_name, $direction);
        if (empty($index)) {
            // no index could be obtained, so there is nothing to iterate
            $this->num_docs = 0;
            $this->num_generations = -1;
        } else {
            $info = $index->getArchiveInfo($index->dir_name);
            if ($this->index_version < 3) {
                $this->num_docs = $info['COUNT'] ?? 0;
                $this->num_generations =
                    (isset($index->generation_info['ACTIVE'])) ?
                    $index->generation_info['ACTIVE'] + 1 : 0;
            } else {
                $this->num_docs = ($info['COUNT'] ?? 0) +
                    ($info['ACTIVE'] ?? 0);
                /* a missing SAVE_PARTITION is treated as partition 0, which
                   is the value the arithmetic on an unset entry produced
                 */
                $this->num_generations = ($info['SAVE_PARTITION'] ?? 0) + 1;
            }
        }
        $this->reset();
    }
    /**
     * Returns the iterators to the first document block that it could iterate
     * over
     */
    public function reset()
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $this->current_generation = ($is_ascending) ? 0 :
            $this->num_generations - 1;
        // forces getGenerationInfo to reload the doc map
        $this->doc_map_generation = -1;
        $this->getGenerationInfo($this->current_generation);
        $this->count_block = 0;
        $this->seen_docs = 0;
        $this->current_offset = ($is_ascending) ? 0 :
            $this->getPreviousDocOffset($this->last_offset);
        $this->next_offset = $this->current_offset;
    }
    /**
     * Mainly used to get the last_offset in shard $generation of the
     * current index bundle. In the case where this wasn't previously
     * cached it loads in the index bundle, sets the current generation to
     * $generation, stores the docids_len (the last offset) of this shard
     * in shard_lens and sets up last_offset as $generation's docids_len.
     * For index versions >= 3, it instead loads the doc map of partition
     * $generation (unless it is the one already loaded), builds key_index
     * from its non-link keys, and sets last_offset to the index of the last
     * entry of key_index.
     *
     * @param int $generation generation to get last offset for
     */
    public function getGenerationInfo($generation)
    {
        if ($this->num_generations <= 0) {
            return;
        }
        if ($this->index_version < 3) {
            if (isset($this->shard_lens[$generation])) {
                $this->last_offset = $this->shard_lens[$generation];
                return;
            }
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($generation, true);
            $shard = $index->getCurrentShard();
            $this->last_offset = $shard->docids_len;
            $this->shard_lens[$generation] = $shard->docids_len;
            return;
        }
        if ($generation == $this->doc_map_generation) {
            // doc map for this generation is already loaded
            return;
        }
        $index = IndexManager::getIndex($this->index_name);
        /* use the requested $generation, not $this->current_generation, so
           that the doc map loaded always matches the doc_map_generation
           recorded below
         */
        $base_folder = $index->getPartitionBaseFolder($generation);
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $index->doc_map_tools;
        $this->doc_map = $doc_map_tools->load($doc_map_filename) ?? [];
        $key_index = [];
        foreach ($this->doc_map as $key => $entry) {
            if (!$index->isType($key, "link")) {
                $key_index[] = $key;
            }
        }
        $this->key_index = $key_index;
        /* NOTE(review): here last_offset is the index of the last key,
           whereas for index versions < 3 it is docids_len. The ascending
           loop in findDocsWithWord stops at next_offset >= last_offset, so
           it looks like the final key of a partition is not returned --
           confirm whether that is intended before changing it.
         */
        $this->last_offset = count($key_index) - 1;
        $this->doc_map_generation = $generation;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * @return mixed an array of doc key => item info for the current block;
     *  [] if the iterator has no generations; -1 if the iterator is past
     *  its last block; null if no results were found and the current
     *  generation is the highest numbered one
     */
    public function findDocsWithWord()
    {
        if ($this->num_generations <= 0) {
            return [];
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $final_generation = $this->num_generations - 1;
        /* && binds tighter than ||, so this is a disjunction of four tests.
           NOTE(review): the two offset tests are not tied to a direction;
           presumably the first was meant for ascending and the second for
           descending iteration -- confirm before tightening.
         */
        if (($is_ascending &&
            $this->current_generation >= $this->num_generations) ||
            ($this->current_generation == $final_generation &&
            $this->current_offset > $this->last_offset) ||
            (!$is_ascending && $this->current_generation < 0) ||
            ($this->current_generation == 0 && $this->current_offset < 0)) {
            return -1;
        }
        $this->next_offset = $this->current_offset;
        $index = IndexManager::getIndex($this->index_name);
        $is_old_format = ($this->index_version < 3);
        $doc_offset_key_len = IndexShard::DOC_KEY_LEN;
        if ($is_old_format) {
            $index->setCurrentShard($this->current_generation, true);
            $shard = $index->getCurrentShard();
            $num_docs_or_links = $shard->num_docs + $shard->num_link_docs;
        }
        $this->getGenerationInfo($this->current_generation);
        if (!$is_old_format) {
            $doc_map_tools = $index->doc_map_tools;
            $doc_keys = $this->key_index;
            $doc_map = $this->doc_map;
        }
        $pre_results = [];
        $num_docs_so_far = 0;
        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
        do {
            if (($is_ascending && $this->next_offset >= $this->last_offset)
                || (!$is_ascending && $this->next_offset < 0)) {
                break;
            }
            if ($is_old_format) {
                $posting = L\packPosting($this->next_offset >> 4, [1]);
                list($doc_id, $num_keys, $item) =
                    $shard->makeItem($posting, $num_docs_or_links,
                    $this->direction);
            } else {
                $doc_id = $doc_keys[$this->next_offset];
                $map_entry = $doc_map[$doc_id];
                /* skip term filter if present. The length of the entry,
                   rather than the entry itself, has to be compared with
                   the filter length.
                 */
                if (is_string($map_entry) &&
                    strlen($map_entry) >= $termsfilter_len + 1 &&
                    $map_entry[0] == 't') {
                    $map_entry = substr($map_entry, $termsfilter_len + 1);
                }
                $doc_info = $doc_map_tools->unpack($map_entry);
                $item = [self::GENERATION => $this->current_generation];
                $item[self::DOC_RANK] = $this->computeDocRank($doc_id,
                    $this->next_offset, $this->current_generation,
                    $this->num_generations, $this->last_offset,
                    $this->last_offset, $this->last_offset,
                    $this->ranking_factors, $is_ascending);
                // first unpacked record: its first value is the doc length
                list($item[self::DOC_LEN], ) =
                    array_values(array_shift($doc_info));
                $item[self::SCORE] = $item[self::DOC_RANK];
                /* second unpacked record: its second value says how many
                   of the remaining records are description scores
                 */
                list(, $num_description_scores) =
                    array_values(array_shift($doc_info));
                $item[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                    $num_description_scores);
                $item[self::USER_RANKS] = array_slice($doc_info,
                    $num_description_scores);
                $item[self::INDEX_VERSION] = $this->index_version;
                $item[self::IS_DOC] = true;
            }
            if (!$is_ascending) {
                $this->next_offset = $this->getPreviousDocOffset(
                    $this->next_offset);
            } else if ($is_old_format) {
                if ($num_keys % 2 == 0) {
                    $num_keys++;
                }
                $this->next_offset += ($num_keys + 1) * $doc_offset_key_len;
            } else {
                $this->next_offset++;
            }
            $pre_results[$doc_id] = $item;
            $num_docs_so_far++;
        } while ($num_docs_so_far < $this->results_per_block);
        $results = [];
        foreach ($pre_results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            if (!empty($this->filter) &&
                $this->filter->isFiltered($host_key)) {
                continue;
            }
            $data[self::KEY] = $keys;
            // inlinks is the domain of the inlink
            list(, $data[self::HASH], $data[self::INLINKS]) =
                str_split($keys, $doc_offset_key_len);
            $data[self::CRAWL_TIME] = $this->index_name;
            $results[$keys] = $data;
        }
        $this->count_block = count($results);
        if ($this->current_generation == $final_generation &&
            $results == []) {
            $results = null;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Get the document offset prior to the current $doc_offset
     *
     * @param int $doc_offset an offset into the document map of an IndexShard
     * @return int previous doc_offset
     */
    public function getPreviousDocOffset($doc_offset)
    {
        $doc_item_len = ($this->index_version < 3) ?
            4 * IndexShard::DOC_KEY_LEN : 1;
        // this is not correct, only works if no additional doc keys
        return $doc_offset - $doc_item_len;
    }
    /**
     * Updates the seen_docs count during an advance() call
     */
    public function advanceSeenDocs()
    {
        if ($this->current_block_fresh != true) {
            $is_ascending = ($this->direction == self::ASCENDING);
            $doc_item_len = ($this->index_version < 3) ?
                4 * IndexShard::DOC_KEY_LEN : 1;
            /* NOTE(review): these are float divisions, so for index
               versions < 3 next_offset can end up fractional -- confirm
               whether rounding is wanted.
             */
            $pre_num_docs = ($is_ascending) ?
                ($this->last_offset - $this->next_offset) / $doc_item_len :
                $this->next_offset / $doc_item_len;
            $num_docs = min($this->results_per_block, $pre_num_docs);
            $this->next_offset = $this->current_offset;
            if ($is_ascending) {
                $this->next_offset += $doc_item_len * $num_docs;
            } else {
                $this->next_offset -= $doc_item_len * $num_docs;
            }
            if ($num_docs < 0) {
                return;
            }
        } else {
            $num_docs = $this->count_block;
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $num_docs;
    }
    /**
     * Forwards the iterator one group of docs
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *  the must be of greater than or equal generation, and if equal the
     *  next block must all have $doc_offsets larger than or equal to
     *  this value
     */
    public function advance($gen_doc_offset = null)
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $this->advanceSeenDocs();
        if (($is_ascending && $this->current_offset < $this->next_offset) ||
            (!$is_ascending && $this->current_offset > $this->next_offset)) {
            $this->current_offset = $this->next_offset;
        } else {
            // no progress possible in this generation
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
        if (($is_ascending && $this->current_offset > $this->last_offset) ||
            (!$is_ascending && $this->current_offset < 0)) {
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
        if ($gen_doc_offset === null) {
            return;
        }
        list($target_generation, $target_offset) = [$gen_doc_offset[0],
            $gen_doc_offset[1]];
        if (($is_ascending &&
            $this->current_generation < $target_generation) ||
            (!$is_ascending &&
            $this->current_generation > $target_generation)) {
            $this->advanceGeneration($target_generation);
            $this->next_offset = $this->current_offset;
        }
        if ($this->current_generation == $target_generation) {
            $this->current_offset = ($is_ascending) ?
                max($this->current_offset, $target_offset) :
                min($this->current_offset, $target_offset);
            /* When descending, a generation is used up once the offset
               goes below 0 (the same test as used earlier in this method);
               comparing with last_offset would skip the generation for
               almost any offset sought.
             */
            if (($is_ascending &&
                $this->current_offset > $this->last_offset) ||
                (!$is_ascending && $this->current_offset < 0)) {
                $this->advanceGeneration();
                $this->next_offset = $this->current_offset;
            }
        }
        $this->seen_docs = $this->current_offset /
            (($this->index_version < 3) ? 4 * IndexShard::DOC_KEY_LEN : 1);
    }
    /**
     * Switches which index shard is being used to return occurrences of
     * the word to the next shard containing the word
     *
     * @param int $generation generation to switch to; if null, the
     *  generation after the current one in the direction of iteration is
     *  used
     */
    public function advanceGeneration($generation = null)
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        if ($generation === null) {
            $generation = ($is_ascending) ? $this->current_generation + 1 :
                $this->current_generation - 1;
        }
        $this->current_generation = $generation;
        if (($is_ascending && $generation < $this->num_generations) ||
            (!$is_ascending && $generation >= 0)) {
            $this->getGenerationInfo($generation);
        }
        /* set after the generation info has been loaded, so that when
           descending the start offset comes from the new generation's
           last_offset rather than from that of the generation just left.
           NOTE(review): reset() starts descending iteration one item before
           last_offset, this starts at last_offset -- confirm which is
           intended.
         */
        $this->current_offset = ($is_ascending) ? 0 : $this->last_offset;
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be return by this iterator
     *
     * @return mixed an array with the desired document offset
     *  and generation; -1 on fail
     */
    public function currentGenDocOffsetWithWord()
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        if ($is_ascending) {
            $is_done = ($this->current_offset > $this->last_offset ||
                $this->current_generation >= $this->num_generations);
        } else {
            $is_done = ($this->current_offset < 0 ||
                $this->current_generation < 0);
        }
        if ($is_done) {
            return -1;
        }
        return [$this->current_generation, $this->current_offset];
    }
}