Last commit for src/library/index_bundle_iterators/WordIterator.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\IndexManager;

/**
 * Used to iterate through the documents associated with a word in
 * an IndexArchiveBundle. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @see IndexArchiveBundle
 */
class WordIterator extends IndexBundleIterator
{
    /**
     * hash of word or phrase that the iterator iterates over
     * @var string
     */
    public $word_key;
    /**
     * Position from end of key that doesn't have to be an exact match
     * (for phrases as using suffix tree)
     * @var int
     */
    public $shift;
    /**
     * The timestamp of the index associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * First shard generation that word info was obtained for
     * @var int
     */
    public $start_generation;
    /**
     * Used to keep track of whether getWordInfo might still get more
     * data on the search terms as advance generations
     * @var bool
     */
    public $no_more_generations;
    /**
     * The next byte offset in the IndexShard
     * @var int
     */
    public $next_offset;
    /**
     * An array of shard generation and posting list offsets, lengths, and
     * numbers of documents
     * @var array
     */
    public $dictionary_info;
    /**
     * File name (including path) of the feed shard for news items
     * @var string
     */
    public $feed_shard_name;
    /**
     * Structure used to hold posting list start and stops for the query
     * in the feed shard
     * @var array
     */
    public $feed_info;
    /**
     * The total number of shards that have data for this word
     * @var int
     */
    public $num_generations;
    /**
     * Index into dictionary_info corresponding to the current shard
     * @var int
     */
    public $generation_pointer;
    /**
     * Numeric number of current shard
     * @var int
     */
    public $current_generation;
    /**
     * The current byte offset in the IndexShard
     * @var int
     */
    public $current_offset;
    /**
     * Starting offset of word occurrence in the IndexShard
     * @var int
     */
    public $start_offset;
    /**
     * Last offset of word occurrence in the IndexShard
     * @var int
     */
    public $last_offset;
    /**
     * Keeps track of whether the word_iterator list is empty because the
     * word does not appear in the index shard
     * @var int
     */
    public $empty;
    /**
     * An array of hashes of domains whose postings are filtered out of the
     * results returned by this iterator; null if no filter was supplied
     * @var array
     */
    public $filter;
    /**
     * The current value of the doc_offset of current posting if known
     * @var int
     */
    public $current_doc_offset;
    /** Host Key position + 1 (first char says doc, inlink or external link)*/
    const HOST_KEY_POS = 17;
    /** Length of a doc key*/
    const KEY_LEN = 8;
    /** If the $limit_feeds constructor input is true then limit the number
     * of items coming from the feed shard to this count.
     */
    const LIMIT_FEEDS_COUNT = 25;
    /**
     * Creates a word iterator with the given parameters.
     *
     * Looks up the dictionary info for the word via IndexManager::getWordInfo,
     * splits off the feed entry (key -1) if feeds are in use, works out the
     * estimated number of documents, and finally calls reset() unless there
     * is neither dictionary nor feed data for the word.
     *
     * @param string $word_key hash of word or phrase to iterate docs of
     * @param int $shift up to what point in key should be a match
     *      when doing the dictionary look up (for phrases because using
     *      suffix tree)
     * @param string $index_name timestamp of the index to use
     * @param bool $raw if false, $word_key is treated as being in our variant
     *      of base64 encoding and is decoded before use
     * @param array $filter an array of hashes of domains to filter from
     *     results
     * @param int $results_per_block the maximum number of results that can
     *      be returned by a findDocsWithWord call
     * @param bool $limit_feeds feed results appear before all others when
     *      gotten out of this iterator (may be reordered later). This flag
     *      controls whether an upper bound of self::LIMIT_FEEDS_COUNT is
     *      imposed on the number of feed results returned
     */
    public function __construct($word_key, $shift, $index_name, $raw = false,
        &$filter = null,
        $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
        $limit_feeds = false)
    {
        if (!$raw) {
            // strip off our modified base64 encoding
            $word_key = L\unbase64Hash($word_key);
        }
        // keep a reference to the caller's filter only when one was supplied
        $this->filter = null;
        if ($filter != null) {
            $this->filter = & $filter;
        }
        $this->word_key = $word_key;
        $this->shift = $shift;
        $this->index_name = $index_name;
        list($estimated_total, $this->dictionary_info) =
            IndexManager::getWordInfo($index_name, $word_key, $shift,
            -1, -1, C\NUM_DISTINCT_GENERATIONS, true);
        $this->feed_shard_name = C\WORK_DIRECTORY . "/feeds/index";
        // feeds are used only if not switched off and the feed shard exists
        $feeds_allowed = !C\nsdefined('NO_FEEDS') || !C\NO_FEEDS;
        $this->use_feeds = $feeds_allowed &&
            file_exists($this->feed_shard_name);
        // the feed postings info, if any, is stored under key -1
        $this->feed_info = false;
        $this->feed_empty = true;
        if ($this->use_feeds && isset($this->dictionary_info[-1])) {
            $this->feed_info = $this->dictionary_info[-1];
            unset($this->dictionary_info[-1]);
            $this->feed_empty = false;
        }
        $this->feed_start = 0;
        $this->feed_end = 0;
        $this->feed_count = 0;
        if (is_array($this->feed_info)) {
            list(, $this->feed_start, $this->feed_end, $this->feed_count, ) =
                $this->feed_info;
            $this->feed_info = [$this->feed_start, $this->feed_end,
                $this->feed_count];
        }
        $this->using_feeds = ($this->feed_count > 0);
        if ($limit_feeds && $this->feed_count > self::LIMIT_FEEDS_COUNT) {
            // cap the feed posting range at LIMIT_FEEDS_COUNT postings
            $this->feed_end = $this->feed_start +
                IndexShard::POSTING_LEN * (self::LIMIT_FEEDS_COUNT - 1);
            $this->feed_count = self::LIMIT_FEEDS_COUNT;
        }
        $this->num_docs = $this->feed_count + $estimated_total;
        $this->empty = true;
        if ($this->dictionary_info !== false) {
            // order entries by generation and re-index them from 0
            ksort($this->dictionary_info);
            $this->dictionary_info = array_values($this->dictionary_info);
            $this->num_generations = count($this->dictionary_info);
            $this->empty = ($this->num_generations == 0);
        }
        $this->no_more_generations =
            ($this->num_generations < C\NUM_DISTINCT_GENERATIONS);
        $this->results_per_block = $results_per_block;
        $this->current_doc_offset = null;
        $this->current_block_fresh = false;
        $this->start_generation = 0;
        $nothing_found = ($this->dictionary_info === false &&
            $this->feed_info === false);
        if (!$nothing_found) {
            $this->reset();
        }
    }
    /**
     * Resets the iterator to the first document block that it could iterate
     * over.
     *
     * If feed postings are available the iterator is positioned at the start
     * of the feed postings (generation -1); otherwise it is positioned at the
     * start of the postings of the first dictionary_info entry.
     */
    public function reset()
    {
        $this->using_feeds = ($this->feed_count > 0);
        $serve_feeds = !$this->feed_empty && $this->use_feeds;
        if ($this->empty) {
            // we shouldn't be called when empty - but to be safe
            $this->start_offset = 0;
            $this->last_offset = -1;
            $this->num_generations = -1;
        } else {
            if ($this->start_generation > 0) {
                // look the word up again starting from generation 0
                list($estimated_total, $this->dictionary_info) =
                    IndexManager::getWordInfo($this->index_name,
                    $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS,
                    true);
                $this->num_docs = $this->feed_count + $estimated_total;
                ksort($this->dictionary_info);
                $this->dictionary_info = array_values($this->dictionary_info);
                $this->num_generations = count($this->dictionary_info);
                $this->no_more_generations =
                    ($this->num_generations < C\NUM_DISTINCT_GENERATIONS);
            }
            list($this->current_generation, $this->start_offset,
                $this->last_offset, ) = $this->dictionary_info[0];
        }
        if ($serve_feeds) {
            // feed postings come first and are labelled as generation -1
            $this->current_offset = $this->feed_start;
            $this->current_generation = -1;
        } else {
            $this->current_offset = $this->start_offset;
        }
        $this->generation_pointer = 0;
        $this->count_block = 0;
        $this->seen_docs = 0;
        $this->current_doc_offset = null;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * Postings are read either from the feed shard (while feed results are
     * still being served) or from the shard of the current generation.
     * Postings whose host key is in the filter, or whose key is too short to
     * split into three parts, are dropped.
     *
     * @return mixed array of posting data keyed by doc key if there are docs
     *      left; -1 if the iterator has nothing (more) to return; null if
     *      the iterator is on its last generation and the current block
     *      ended up empty
     */
    public function findDocsWithWord()
    {
        $no_feeds = $this->feed_empty || !$this->use_feeds;
        $feed_in_use = $this->using_feeds && !$no_feeds;
        if ($this->empty && $no_feeds) {
            return -1;
        }
        if (!$feed_in_use &&(($this->generation_pointer>=$this->num_generations)
            || ($this->generation_pointer == $this->num_generations - 1 &&
            $this->current_offset > $this->last_offset))) {
            // past the last generation or past the end of its postings
            return -1;
        }
        $pre_results = [];
        if ($feed_in_use) {
            $this->next_offset = $this->current_offset;
            $feed_shard = IndexManager::getIndex("feed");
            if ($feed_shard) {
                $pre_results = $feed_shard->getPostingsSlice(
                    $this->feed_start,
                    $this->next_offset, $this->feed_end,
                    $this->results_per_block);
                $time = time();
                foreach ($pre_results as $keys => $pre_result) {
                    $pre_results[$keys][self::IS_FEED] = true;
                    // rank feed items by recency: the smaller the difference
                    // between now and the SUMMARY_OFFSET value, the higher
                    // the rank
                    $delta = $time - $pre_result[self::SUMMARY_OFFSET];
                    $pre_results[$keys][self::DOC_RANK] = 720000 /
                        max($delta, 1);
                }
            }
        } else if (!$this->empty) {
            $this->next_offset = $this->current_offset;
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            //the next call also updates next offset
            $shard = $index->getCurrentShard();
            $pre_results = $shard->getPostingsSlice(
                $this->start_offset,
                $this->next_offset, $this->last_offset,
                $this->results_per_block);
        }
        $results = [];
        $doc_key_len = IndexShard::DOC_KEY_LEN;
        $filter = ($this->filter == null) ? [] : $this->filter;
        foreach ($pre_results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            /* strict comparison: host keys are raw hash bytes, so a loose
               comparison could wrongly treat two different numeric-looking
               strings as equal
             */
            if (in_array($host_key, $filter, true)) {
                continue;
            }
            $data[self::KEY] = $keys;
            // inlinks is the domain of the inlink
            $key_parts = str_split($keys, $doc_key_len);
            if (!isset($key_parts[2])) {
                continue;
            }
            // the first part (hash of url) is not needed here
            list(, $data[self::HASH], $data[self::INLINKS]) = $key_parts;
            if (!empty($data[self::IS_FEED])) {
                $data[self::CRAWL_TIME] = "feed";
            } else {
                $data[self::CRAWL_TIME] = $this->index_name;
            }
            $results[$keys] = $data;
        }
        $this->count_block = count($results);
        if ($this->generation_pointer == $this->num_generations - 1 &&
            $results == []) {
            $results = null;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Updates the seen_docs count during an advance() call
     *
     * If the current block is not marked fresh, the number of postings being
     * skipped is taken to be the smaller of results_per_block and the value
     * IndexShard::numDocsOrLinks gives for next_offset and the end of the
     * posting list in use (feed_end while serving feed results, last_offset
     * otherwise), and next_offset is recomputed from current_offset. If the
     * block is fresh, count_block is used as the number of docs seen.
     */
    public function advanceSeenDocs()
    {
        if ($this->current_block_fresh != true) {
            if ($this->using_feeds && $this->use_feeds) {
                $num_docs = min($this->results_per_block,
                    IndexShard::numDocsOrLinks($this->next_offset,
                        $this->feed_end));
            } else {
                $num_docs = min($this->results_per_block,
                    IndexShard::numDocsOrLinks($this->next_offset,
                        $this->last_offset));
            }
            // note: next_offset is rewritten before the sign check below, so
            // it has already been moved even when $num_docs is negative
            $this->next_offset = $this->current_offset;
            $this->next_offset += IndexShard::POSTING_LEN * $num_docs;
            if ($num_docs < 0) {
                // leave seen_docs and current_block_fresh untouched
                return;
            }
        } else {
            $num_docs = $this->count_block;
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $num_docs;
    }
    /**
     * Forwards the iterator one group of docs
     *
     * With no argument this is just plainAdvance(). With a target
     * generation, doc_offset pair, nothing is done if the iterator is
     * exhausted or already at or beyond the target; otherwise the iterator
     * advances one block and then seeks forward towards the target.
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *     the next block must be of greater than or equal generation, and if
     *     equal the next block must all have $doc_offsets larger than or
     *     equal to this value
     */
    public function advance($gen_doc_offset = null)
    {
        if ($gen_doc_offset == null) {
            $this->plainAdvance();
            return;
        }
        $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
        // genDocOffsetCmp is not defined in this class (presumably
        // inherited); a result >= 0 is treated as "already at or past target"
        if ($cur_gen_doc_offset == -1 ||
            $this->genDocOffsetCmp($cur_gen_doc_offset,
            $gen_doc_offset) >= 0) {
            return;
        }
        $this->plainAdvance();
        if ($this->current_generation < $gen_doc_offset[0]) {
            // still in an earlier generation than the target: skip ahead
            $this->advanceGeneration($gen_doc_offset[0]);
            $this->next_offset = $this->current_offset;
        }
        // evaluated after the advances above, which may have switched the
        // iterator off of the feed postings
        $using_feeds = $this->using_feeds && $this->use_feeds;
        if ($using_feeds) {
            $shard = IndexManager::getIndex("feed");
            $last = $this->feed_end;
        } else {
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            $shard = $index->getCurrentShard();
            $last = $this->last_offset;
        }
        if ($this->current_generation == $gen_doc_offset[0]) {
            // same generation as the target: ask the shard for a posting
            // offset, doc offset pair for the target doc offset within
            // [next_offset, $last]
            $offset_pair = $shard->nextPostingOffsetDocOffset(
                $this->next_offset, $last, $gen_doc_offset[1]);
            if ($offset_pair === false) {
                // nothing suitable in this generation, move to the next one
                $this->advanceGeneration();
                $this->next_offset = $this->current_offset;
            } else {
                list($this->current_offset, $this->current_doc_offset) =
                    $offset_pair;
            }
        }
        // recompute seen_docs from the position within the current postings
        if ($this->current_generation == -1) {
            $this->seen_docs = ($this->current_offset - $this->feed_start) /
                IndexShard::POSTING_LEN;
        } else {
            $this->seen_docs = ($using_feeds) ? $this->feed_count : 0;
            $this->seen_docs += ($this->current_offset - $this->start_offset) /
                IndexShard::POSTING_LEN;
        }
    }
    /**
     * Forwards the iterator one group of docs. This is what's called
     * by @see advance($gen_doc_offset) if $gen_doc_offset is null
     *
     * After updating the seen docs count, the iterator either moves to
     * next_offset within the current postings or, if that would not move it
     * forward, switches to the next generation. If the resulting offset lies
     * beyond the end of the postings in use, it switches generation (again).
     */
    public function plainAdvance()
    {
        $this->advanceSeenDocs();
        $this->current_doc_offset = null;
        if ($this->current_offset >= $this->next_offset) {
            // no progress possible in the current postings
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        } else {
            $this->current_offset = $this->next_offset;
        }
        // end of whichever posting list is now in use (feed or generation)
        $list_end = ($this->using_feeds && $this->use_feeds) ?
            $this->feed_end : $this->last_offset;
        if ($this->current_offset > $list_end) {
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
    }
    /**
     * Switches which index shard is being used to return occurrences of
     * the word to the next shard containing the word
     *
     * Always moves at least one entry forward in dictionary_info and keeps
     * moving while the current generation is less than $generation and
     * entries remain. When the known entries run out before $generation is
     * reached, and more generations might exist, further dictionary info is
     * fetched with IndexManager::getWordInfo and appended.
     *
     * @param int $generation generation to advance to or beyond; if null,
     *      the current generation is used, so only a single step is taken
     */
    public function advanceGeneration($generation = null)
    {
        if ($this->using_feeds && $this->use_feeds) {
            // leaving the feed postings: the increment in the loop below
            // then lands on the first entry (index 0) of dictionary_info
            $this->using_feeds = false;
            $this->generation_pointer = -1;
        }
        if ($generation === null) {
            $generation = $this->current_generation;
        }
        do {
            if ($this->generation_pointer < $this->num_generations) {
                $this->generation_pointer++;
            }
            if ($this->generation_pointer < $this->num_generations) {
                list($this->current_generation, $this->start_offset,
                    $this->last_offset, )
                    = $this->dictionary_info[$this->generation_pointer];
                $this->current_offset = $this->start_offset;
            }
            if (!$this->no_more_generations &&
                $this->current_generation < $generation &&
                $this->generation_pointer >= $this->num_generations) {
                list($estimated_remaining_total, $info) =
                    IndexManager::getWordInfo($this->index_name,
                    $this->word_key, 0, -1, $this->num_generations,
                    C\NUM_DISTINCT_GENERATIONS, true);
                /* getWordInfo may give false rather than an array for the
                   dictionary info (the constructor checks for this), and
                   count(false) is a TypeError in PHP 8
                 */
                if (is_array($info) && count($info) > 0) {
                    $this->num_docs = $this->seen_docs +
                        $estimated_remaining_total;
                    ksort($info);
                    $this->dictionary_info = array_merge($this->dictionary_info,
                        array_values($info));
                    $this->num_generations = count($this->dictionary_info);
                    $this->no_more_generations =
                        count($info) < C\NUM_DISTINCT_GENERATIONS;
                    //will increment back to where were next loop
                    $this->generation_pointer--;
                } else {
                    // nothing further came back, so don't repeat this
                    // lookup on later calls
                    $this->no_more_generations = true;
                }
            }

        } while($this->current_generation < $generation &&
            $this->generation_pointer < $this->num_generations);
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be returned by this iterator
     *
     * The doc offset, once computed, is cached in current_doc_offset. Feed
     * postings are reported as belonging to generation -1.
     *
     * @return mixed an array with the generation followed by the document
     *      offset; -1 on fail
     */
    public function currentGenDocOffsetWithWord()
    {
        if ($this->current_doc_offset !== null) {
            // already known for the current position
            return [$this->current_generation, $this->current_doc_offset];
        }
        $in_feed = $this->using_feeds && $this->use_feeds &&
            !$this->feed_empty;
        if ($in_feed) {
            if ($this->current_offset > $this->feed_end) {
                return -1;
            }
            $feed_shard = IndexManager::getIndex("feed");
            $this->current_doc_offset =
                $feed_shard->docOffsetFromPostingOffset($this->current_offset);
            return [-1, $this->current_doc_offset];
        }
        if ($this->current_offset > $this->last_offset ||
            $this->generation_pointer >= $this->num_generations) {
            // past the end of the postings or out of generations
            return -1;
        }
        $bundle = IndexManager::getIndex($this->index_name);
        $bundle->setCurrentShard($this->current_generation, true);
        $shard = $bundle->getCurrentShard();
        $this->current_doc_offset =
            $shard->docOffsetFromPostingOffset($this->current_offset);
        return [$this->current_generation, $this->current_doc_offset];
    }
}
ViewGit