Last commit for src/library/index_bundle_iterators/WordIterator.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\PartitionDocumentBundle;

/**
 * Used to iterate through the documents associated with a word in
 * an IndexArchiveBundle. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @see IndexArchiveBundle
 */
class WordIterator extends IndexBundleIterator
{
    /**
     * Weighting factor to multiply to make a doc-rank (approximate score of
     * document based on its position in the index (when crawled)).
     * This weight affects the amount doc_rank determines the overall score of
     * a document.
     */
    const DOC_RANK_WEIGHT = 50;
    /**
     * Host Key position + 1 (first char says doc, inlink or external link)
     */
    const HOST_KEY_POS = 17;
    /**
     *  Length of a doc key part
     */
    const KEY_LEN = 8;
    /**
     * Word key above in our modified base 64 encoding
     * @var string
     */
    public $base64_word_key;
    /**
     * The current value of the doc_offset of current posting if known
     * @var int
     */
    public $current_doc_offset;
    /**
     * Numeric number of current shard
     * @var int
     */
    public $current_generation;
    /**
     * The current byte offset in the IndexShard (if older index)
     * @var int
     */
    public $current_offset;
    /**
     * An array of shard generation and posting list offsets, lengths, and
     * numbers of documents
     * @var array
     */
    public $dictionary_info;
    /**
     * Keeps track of whether the word_iterator list is empty because the
     * word does not appear in the index shard
     * @var int
     */
    public $empty;
    /**
     * Model responsible for keeping track of edited and deleted search results
     * @var SearchfiltersModel
     */
    public $filter;
    /**
     * Index into dictionary_info corresponding to the current shard
     * @var int
     */
    public $generation_pointer;
    /**
     * The timestamp of the index is associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * The index version affects how the iterator cycles through documents
     * There was a big change in index format between version 3 and prior
     * formats
     * @var int
     */
    public $index_version;
    /**
     * Whether word key corresponds to a meta word
     * @var string
     */
    public $is_meta;
    /**
     * Last Offset of word occurrence in the IndexShard
     * @var int
     */
    public $last_offset;
    /**
     * The next byte offset in the IndexShard
     * @var int
     */
    public $next_offset;
    /**
     * Used to keep track of whether getWordInfo might still get more
     * data on the search terms as advance generations
     * @var bool
     */
    public $no_more_generations;
    /**
     * The total number of shards that have data for this word
     * @var int
     */
    public $num_generations;
    /**
     * How url, keywords, and title words should influence relevance
     * and doc rank calculations
     * @var array
     */
    public $ranking_factors;
    /**
     * First shard generation that word info was obtained for
     * @var int
     */
    public $start_generation;
    /**
     * Starting Offset of word occurrence in the IndexShard
     * @var int
     */
    public $start_offset;
    /**
     * hash of word or phrase that the iterator iterates over
     * @var string
     */
    public $word_key;
    /**
     * Creates a word iterator with the given parameters.
     *
     * @param string $word_key hash of word or phrase to iterate docs of
     * @param string $index_name time_stamp of the to use
     * @param bool $raw whether the $word_key is our variant of base64 encoded
     * @param SearchfiltersModel $filter Model responsible for keeping track
     *      of edited and deleted search results
     * @param int $results_per_block the maximum number of results that can
     *      be returned by a findDocsWithWord call
     * @param int $direction when results are access from $index_name in
     *      which order they should be presented. self::ASCENDING is from first
     *      added to last added, self::DESCENDING is from last added to first
     *      added. Note: this value is not saved permanently. So you
     *      could in theory open two read only versions of the same bundle but
     *      reading the results in different directions
     * @param array $ranking_factors field say how url, keywords, and
     *     title words should influence relevance and doc rank calculations
     */
    public function __construct($word_key, $index_name, $raw = false,
        $filter = null, $results_per_block =
        IndexBundleIterator::RESULTS_PER_BLOCK, $direction=self::ASCENDING,
        $ranking_factors = [])
    {
        if ($raw == false) {
            //get rid of our modified base64 encoding
            $word_key = L\unbase64Hash($word_key);
        }
        $this->is_meta = (strpos(substr($word_key, 9), ":") !== false);
        $this->direction = $direction;
        $this->filter = $filter;
        $this->word_key = $word_key;
        $this->base64_word_key = L\base64Hash($word_key);
        $this->index_name = $index_name;
        $this->termInfoIteratorFields($index_name, $word_key);
        $this->current_doc_offset = null;
        $this->results_per_block = $results_per_block;
        $this->current_block_fresh = false;
        $this->start_generation = ($direction == self::ASCENDING) ? 0 :
            "ACTIVE";
        foreach (["CLD_URL_BONUS" => C\CLD_URL_BONUS,
            "HOST_URL_BONUS" => C\HOST_URL_BONUS,
            "HOST_KEYWORD_BONUS" => C\HOST_KEYWORD_BONUS,
            "PATH_KEYWORD_BONUS" => C\PATH_KEYWORD_BONUS,
            "TITLE_BONUS" => C\TITLE_BONUS,
            ] as $factor => $default) {
            $this->ranking_factors[$factor] = $ranking_factors[$factor] ??
                $default;
        }
        if (!$this->empty) {
            $this->reset();
        }
    }
    /**
     * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
     * depending on the direction in which this iterator ttraverse the
     * underlying index archive bundle.
     *
     * @return int direction traversing underlying archive bundle
     */
    public function getDirection()
    {
        return $this->direction;
    }
    /**
     * Resets the iterator to the first document block that it could iterate
     * over
     */
    public function reset()
    {
        if (!$this->empty) {//we shouldn't be called when empty - but to be safe
            $this->termInfoIteratorFields($this->index_name,
                    $this->word_key);
            $info = ($this->direction == self::ASCENDING) ?
                $this->dictionary_info[0] : $this->dictionary_info[
                $this->num_generations - 1];
            if ($this->index_version < 3) {
                list($this->current_generation, $this->start_offset,
                    $this->last_offset, ) = $info;
            } else {
                $this->current_generation = $info['PARTITION'];
                $this->start_offset = 0;
                $this->last_offset = $info['NUM_DOCS'] - 1;
            }
        } else {
            $this->start_offset = 0;
            $this->last_offset = -1;
            $this->num_generations = -1;
        }
        if ($this->direction == self::ASCENDING) {
            $this->current_offset = $this->start_offset;
            $this->generation_pointer = 0;
        } else {
            $this->current_offset = $this->last_offset;
            /*  reset pointer to the number of gens, which in reverse is the
               first one we want
             */
            $this->generation_pointer = $this->num_generations - 1;
        }
        $this->count_block = 0;
        $this->seen_docs = 0;
        $this->current_doc_offset = null;
    }
    /**
     * Used to compute fields such as $this->total_num_docs for this iterator on
     * term $word_key for index $index_name
     *
     * @param string $index_name name of index to compute statistics with
     *      respect to
     * @param string $word_key term to compute statics with respect to
     */
    protected function termInfoIteratorFields($index_name, $word_key)
    {
        if (!empty($this->term_info_computed)) {
            return;
        }
        $this->index_version = IndexManager::getVersion($index_name);
        $word_info = IndexManager::getWordInfo($index_name, $word_key, -1, -1,
            C\NUM_DISTINCT_GENERATIONS, true);
        if ($this->index_version < 3) {
            list($this->num_docs, $this->dictionary_info) = $word_info;
        } else {
            $this->total_num_docs = $word_info['TOTAL_NUM_DOCS'] ?? 0;
            $this->total_num_docs_and_links =
                $word_info['TOTAL_NUM_LINKS_AND_DOCS'] ?? 0;
            $this->max_items_per_partition =
                $word_info['MAX_ITEMS_PER_PARTITION'] ??
                PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
            $this->avg_items_per_partition =
                $word_info['AVG_ITEMS_PER_PARTITION'] ??
                PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
            $this->total_number_of_partitions =
                $word_info['TOTAL_NUMBER_OF_PARTITIONS'] ?? 0;
            $this->num_docs = $word_info['TOTAL_COUNT'] ?? 0;
            $this->num_occurrences = $word_info['TOTAL_OCCURRENCES'] ?? 0;
            $this->dictionary_info = $word_info['ROWS'] ?? [];
            $this->threshold_exceeded = $word_info['THESHOLD_EXCEEDED'] ??
                false;
            $this->archive_file = $word_info['ARCHIVE_FILE'] ?? "";
        }
        if (empty($this->dictionary_info)) {
            $this->empty = true;
            $this->num_generations = 0;
        } else {
            if ($this->index_version < 3) {
                ksort($this->dictionary_info);
                $this->dictionary_info = array_values($this->dictionary_info);
            }
            $this->num_generations = count($this->dictionary_info);
            $this->empty = ($this->num_generations == 0);
        }
        $this->term_info_computed = true;
        $this->no_more_generations = ($this->num_generations <
            C\NUM_DISTINCT_GENERATIONS);
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * @return mixed doc ids and score if there are docs left, -1 otherwise
     */
    public function findDocsWithWord()
    {
        if ($this->empty) {
            return -1;
        }
        $ascending = ($this->direction == self::ASCENDING);
        if ($ascending) {
            if (($this->generation_pointer >= $this->num_generations) ||
                $this->generation_pointer == $this->num_generations - 1 &&
                $this->current_offset > $this->last_offset) {
                return -1;
            }
        } else {
            if (($this->generation_pointer < 0)
                || ($this->generation_pointer == 0 &&
                $this->current_offset < $this->start_offset)) {
                return -1;
            }
        }
        $pre_results = [];
        if (!$this->empty) {
            $pre_results = $this->getPostingsSliceResults();
        }
        $results = [];
        $doc_key_len = self::KEY_LEN;
        foreach ($pre_results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            if (!empty($this->filter) && $this->filter->isFiltered($host_key)) {
                continue;
            }
            // inlinks is the domain of the inlink
            $key_parts = str_split($keys, $doc_key_len);
            $data[self::KEY] = $keys;
            if (isset($key_parts[2])) {
                list(, $data[self::HASH], $data[self::INLINKS]) =
                    $key_parts;
            } else {
                continue;
            }
            $data[self::CRAWL_TIME] = $this->index_name;
            $results[$keys] = $data;
        }
        $this->count_block = count($results);
        if ($this->generation_pointer == $this->num_generations - 1 &&
            empty($pre_results)) {
            $results = -1;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Given the current_offset, result_per_block, and index used get the
     * result_per_block postings starting from current_offset in the current
     * direction (ascending or descending) for the term word iterator
     * iterates over from the index.
     */
    public function getPostingsSliceResults()
    {
        $this->next_offset = $this->current_offset;
        if ($this->index_version < 3) {
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            //the next call also updates next offset
            $shard = $index->getCurrentShard(true);
            $pre_results = $shard->getPostingsSlice($this->start_offset,
                $this->next_offset, $this->last_offset,
                $this->results_per_block, $this->direction);
            return $pre_results;
        }
        if ($this->direction == self::ASCENDING) {
            if ($this->current_offset < $this->start_offset) {
                $this->current_offset = $this->start_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset > $this->last_offset) {
                return [];
            }
            $start_slice = $this->next_offset;
            $num_slice = min($this->results_per_block,
                $this->last_offset - $this->next_offset + 1);
            $this->next_offset += $num_slice;
        } else {
            if ($this->current_offset > $this->last_offset) {
                $this->current_offset = $this->last_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset < $this->start_offset) {
                return [];
            }
            $num_slice = max($this->results_per_block,
                $this->start_offset);
            $this->next_offset -= $num_slice;
            $start_slice = $this->next_offset + 1;
        }
        $postings = $this->getGenerationPostings($this->generation_pointer);
        $postings = array_slice($postings, $start_slice, $num_slice);
        $key_postings = $this->getDocKeyPositionsScoringInfo($postings,
            $this->current_generation);
        return $key_postings;
    }
    /**
     * Add to a set of postings from a partition scoring information, position
     * list information and info about the relative weights of given position
     * based on the position list file and doc_map file.
     *
     *  @param array $postings posting data to add scoring information to
     *  @param int $partition which partition from the PartitionDocumentBundle
     *    postings a re related to
     */
    public function getDocKeyPositionsScoringInfo($postings, $partition)
    {
        $key_postings = [];
        $index = IndexManager::getIndex($this->index_name);
        $base_folder = $index->getPartitionBaseFolder($partition);
        $doc_key_len = IndexDocumentBundle::DOCID_PART_LEN;
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $index->doc_map_tools;
        $positions_filename = $base_folder . "/" .
            IndexDocumentBundle::POSITIONS_FILENAME;
        $fh = (file_exists($positions_filename)) ?
            fopen($positions_filename, "r") : false;
        $number_of_partitions = $this->total_number_of_partitions;
        $num_doc_keys = $doc_map_tools->countTableEntries($doc_map_filename);
        $is_ascending = ($this->direction == self::ASCENDING);
        $num_seen_partitions = ($is_ascending) ?
            $partition + 1 : $number_of_partitions - $partition;
        $occurrences_per_doc = $this->num_occurrences /
            max($this->total_num_docs, 1);
        foreach ($postings as $posting) {
            $posting[self::GENERATION] = $partition;
            if ($posting['POSITIONS_LEN'] > 0 && !empty($fh)) {
                fseek($fh, $posting['POSITIONS_OFFSET']);
                $encoded_positions = fread($fh, $posting['POSITIONS_LEN']);
                $posting[self::POSITION_LIST] = L\decodePositionList(
                    $encoded_positions, $posting['FREQUENCY']);
            } else {
                $posting[self::POSITION_LIST] = [];
            }
            $doc_map_index = $posting['DOC_MAP_INDEX'];
            $entry = $doc_map_tools->findEntryAtIndexTableName(
                $doc_map_filename, $doc_map_index);
            $docid_len = IndexDocumentBundle::DOCID_LEN;
            if (strlen($entry) < $docid_len) {
                continue;
            }
            $doc_key = substr($entry, 0, $docid_len);
            $values = substr($entry, $docid_len);
            if (IndexDocumentBundle::isType($doc_key, "doc")) {
                $posting[self::IS_DOC] = true;
            }
            $doc_info = $doc_map_tools->unpack($values);
            if (empty($doc_info)) {
                continue;
            }
            $time = time();
            $posting[self::KEY] = $doc_key;
            list($posting[self::DOC_LEN], $original_score) =
                array_values(array_shift($doc_info));
            $is_timestamp_score = ($original_score <= $time &&
                $original_score > ($time >> 1));
            if ($is_timestamp_score) {
                $posting[self::SCORE] = 0.5 * log($time/
                    (max(1, $time - $original_score)), 2);
                $posting[self::DOC_RANK] = $posting[self::SCORE];
            } else {
                $posting[self::SCORE] =  ($is_ascending) ?
                    $num_doc_keys - $doc_map_index :
                    $doc_map_index;
                $remaining_partitions =  ($is_ascending) ?
                    $number_of_partitions - $num_seen_partitions :
                    $num_seen_partitions - 1;
                $posting[self::DOC_RANK] = log(
                    $remaining_partitions * $this->avg_items_per_partition +
                    $posting[self::SCORE], 10);
                if(L\IndexDocumentBundle::isAHostDocId($doc_key)) {
                    $posting[self::DOC_RANK] +=
                        (L\IndexDocumentBundle::isACldDocId($doc_key)) ?
                        $this->ranking_factors["CLD_URL_BONUS"] :
                        $this->ranking_factors["HOST_URL_BONUS"];
                }
            }
            list($preface_positions, $num_description_scores) =
                array_values(array_shift($doc_info));
            $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["TITLE_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["HOST_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                $num_description_scores);
            if ($posting['FREQUENCY'] > 0) {
                list($frequency, $preface_score) =
                    $this->frequencyNormalizationPrefaceScoring(
                    $posting[self::POSITION_LIST],
                    $posting[self::DOC_LEN],
                    $posting["HOST_KEYWORDS_END_POS"],
                    $posting["TITLE_END_POS"],
                    $posting["PATH_KEYWORDS_END_POS"],
                    $posting[self::DESCRIPTION_SCORES]);
                // Divergence-from-randomness + preface score
                $posting[self::RELEVANCE] =
                    ((log(1 + $occurrences_per_doc, 2) + $frequency *
                    log(1 + 1/max(1, $occurrences_per_doc), 2)) /
                    ($frequency + 1)) + $preface_score;
            } else {
                 /*
                   this will typically be the relaveance score for a meta word
                   As will always be frequency 1 and have no position info
                   set close to 0. (Not zero to avoid div by 0's)
                  */
                 $posting[self::RELEVANCE] = 0.01;
            }
            $posting[self::SCORE] = $posting[self::DOC_RANK] +
                $posting[self::RELEVANCE];
            $posting[self::USER_RANKS] = array_slice($doc_info,
                $num_description_scores);
            $posting[self::INDEX_VERSION] = $this->index_version;
            $key_postings[$doc_key] = $posting;
        }
        if (!empty($fh)) {
            fclose($fh);
        }
        return $key_postings;
    }
    /**
     * Normalizes the frequencies of a term within a document with respect to
     * the length of the document, the positions of the term with the document
     * and the overall importance score for a given position within the document
     * Also computes the score of the posting for the host keywords,
     * title keywords, and path keywords.
     *
     * @param array $positions positions of this iterators term in the document
     * @param int $num_words number of terms in the document
     * @param int $host_keywords_end_pos term offset into the document summary
     *  that demarks the end of the host keywords portion of the summary
     * @param int $title_end_pos absolute term offset into the document summary
     *  that demarks the end of the title portion of the summary
     * @param int $path_keywords_end_pos absolute term offset into the document
     *  summary that demarks the end of the title portion of the summary
     * @param array $descriptions_scores boundaries and scores of different
     *  regions with document
     * @return array [normalized frequency, score for host name, title,
     *     and path keywords]
     */
    public function frequencyNormalizationPrefaceScoring(
        $positions, $num_words, $host_keywords_end_pos,
        $title_end_pos, $path_keywords_end_pos, $descriptions_scores)
    {
        $num_words = max($num_words, 1);
        /*
         * Amati and van Rijsbergen suggest a normalization of
         * log_2(1 + l_avg/l_d) for divergence-from-randomness
         * Here l_avg = average num words in a document, l_d = num words
         * current document. C\MAX_DESCRIPTION_LEN is the max number
         * of characters in a document. Assuming the average word is
         * around 5 chars + whitespace char + punctuation, and most documents
         * are summuarized, to close to the max character length, we
         * approximate l_avg as C\MAX_DESCRIPTION_LEN/7 in the below.
         */
        $length_normalization = log(1 + C\MAX_DESCRIPTION_LEN/(7 * $num_words),
            2);
        $first_index = 0;
        $old_pos = 0;
        if (empty($descriptions_scores)) {
            return count($positions);
        }
        $num_scores = count($descriptions_scores);
        $weighted_frequency = 0;
        $preface_score = 0;
        foreach ($positions as $position) {
            if ($position < $host_keywords_end_pos) {
                $preface_score += $this->ranking_factors["HOST_KEYWORD_BONUS"] /
                    max($host_keywords_end_pos - 1, 1);
                continue;
            } else if ($position < $title_end_pos) {
                $preface_score += $this->ranking_factors["TITLE_BONUS"] /
                    max($title_end_pos - $host_keywords_end_pos, 1);
                continue;
            } else if ($position < $path_keywords_end_pos) {
                $preface_score += $this->ranking_factors["PATH_KEYWORD_BONUS"] /
                    max($path_keywords_end_pos - $title_end_pos, 1);
                continue;
            }
            $last_index = $num_scores - 1;
            /* description score offsets are with respect to the description
               only so we subtract from the term position the offset of the
               non-description
             */
            $position -= ($path_keywords_end_pos + 1);
            while ($first_index < $last_index) {
                $mid_index = ceil(($first_index + $last_index)/2.0);
                if ($descriptions_scores[$mid_index]['POS'] > $position) {
                    $last_index = $mid_index - 1;
                } else {
                    $first_index = $mid_index;
                }
            }
            $weight = $descriptions_scores[$first_index]['SCORE'];
            $weighted_frequency += $weight;
        }
        $frequency = $weighted_frequency * $length_normalization;
        return [$frequency, $preface_score];
    }
    /**
     * Updates the seen_docs count during an advance() call
     */
    public function advanceSeenDocs()
    {
        $version = $this->index_version;
        if ($this->current_block_fresh != true) {
            if ($this->direction == self::ASCENDING) {
                $remaining_postings = ($version < 3) ?
                    IndexShard::numDocsOrLinks(
                    $this->next_offset, $this->last_offset) :
                    $this->last_offset - $this->next_offset;
                $num_docs = min($this->results_per_block, $remaining_postings);
                $delta_sign = 1;
            } else {
                if ($version < 3) {
                    $total_guess = IndexShard::numDocsOrLinks(
                        $this->start_offset, $this->next_offset);
                    $num_docs = $total_guess % $this->results_per_block;
                    if ($num_docs == 0) {
                        $num_docs = $this->results_per_block;
                    } else {
                        $num_docs = IndexShard::numDocsOrLinks(
                            $this->start_offset, $this->last_offset) %
                            $this->results_per_block;
                        if ($num_docs == 0) {
                            $num_docs = $this->results_per_block;
                        }
                    }
                } else {
                    $remaining_postings = $this->next_offset -
                        $this->start_offset + 1;
                    $num_docs = min($this->results_per_block,
                        $remaining_postings);
                }
                $delta_sign = -1;
            }
            $posting_len = ($version < 3) ? IndexShard::POSTING_LEN : 1;
            $this->next_offset = $this->current_offset;
            $this->next_offset += $delta_sign * $posting_len * $num_docs;
            if ($num_docs <= 0) {
                return;
            }
        } else {
            $num_docs = $this->count_block;
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $num_docs;
    }
    /**
     * Forwards the iterator one group of docs
     * @param array $gen_doc_offset a generation, doc_offset pair. If not null,
     *     (in the ascending search case opposite for descending), the pair
     *     must be of greater than or equal generation, and if equal the
     *     next block must all have $doc_offsets larger than or equal to
     *     this value.
     */
    public function advance($gen_doc_offset = null)
    {
        if ($gen_doc_offset == null) {
            $this->plainAdvance();
            return;
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
        if ($cur_gen_doc_offset == -1 ||
            $this->genDocOffsetCmp($cur_gen_doc_offset,
            $gen_doc_offset, $this->direction) >= 0) {
            return;
        }
        $advance_check = ($is_ascending) ?
            ($this->current_generation < $gen_doc_offset[0]) :
            ($this->current_generation > $gen_doc_offset[0]);
        if ($advance_check) {
            $this->advanceGeneration($gen_doc_offset[0]);
            $this->next_offset = $this->current_offset;
        }
        if ($this->index_version < 3) {
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            $shard = $index->getCurrentShard();
        }
        if ($this->current_generation == $gen_doc_offset[0]) {
            if ($this->index_version < 3) {
                $end_offset = ($is_ascending) ? $this->last_offset :
                    $this->start_offset;
                $offset_pair = $shard->nextPostingOffsetDocOffset(
                    $this->next_offset, $end_offset, $gen_doc_offset[1],
                    $this->direction);
            } else {
                $offset_pair = $this->nextDocIndexOffsetPair(
                    $gen_doc_offset[1]);
            }
            if ($offset_pair === false) {
                $this->advanceGeneration();
                $this->next_offset = $this->current_offset;
            } else {
                list($this->current_offset, $this->current_doc_offset) =
                    $offset_pair;
                $this->next_offset = $this->current_offset;
            }
        }
        $posting_len = ($this->index_version < 3) ? IndexShard::POSTING_LEN : 1;
        if ($is_ascending) {
            $this->seen_docs = ($this->current_offset - $this->start_offset) /
                $posting_len;
        } else {
            $this->seen_docs = ($this->last_offset - $this->current_offset) /
                $posting_len;
        }
        $this->current_block_fresh = false;
    }
    /**
     * Computes a pair [posting_slice_offset, $doc_index], such that
     * the $doc_index when shift to make a doc_offset is greater than
     * $doc_offset and posting_slice_offset is the offset of the first
     * posting with this property.
     * @param int $doc_offset that we are try to find a posting whose
     *  doc_index has a bigger doc_offset
     * @return array [posting_slice_offset, $doc_index]
     */
    public function nextDocIndexOffsetPair($doc_offset)
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $end_offset = ($is_ascending)? $this->last_offset : $this->start_offset;
        $postings = $this->getGenerationPostings($this->generation_pointer);
        if (empty($postings[$end_offset]) ) {
            return false;
        }
        $last_doc = $postings[$end_offset]["DOC_MAP_INDEX"];
        if (($is_ascending && $last_doc < $doc_offset) ||
           (!$is_ascending && $last_doc > $doc_offset)) {
              return false;
        }
        $next_offset = ($this->next_offset ?? $this->current_offset);
        $last_offset = $next_offset;
        $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ?? $doc_offset;
        $cmp = ($is_ascending) ?
            ($next_doc < $doc_offset && $next_offset <= $end_offset):
            ($next_doc > $doc_offset && $next_offset >= $end_offset);
        $delta = ($is_ascending) ? 1 : -1;
        while ($cmp)  {
            $last_offset = $next_offset;
            $next_offset += $delta;
            $delta *= 2;
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ?? $doc_offset;
            $cmp = ($is_ascending) ?
                ($next_doc < $doc_offset && $next_offset <= $end_offset):
                ($next_doc > $doc_offset && $next_offset >= $end_offset);
        }
        if (($is_ascending && $next_offset > $end_offset) ||
            (!$is_ascending && $next_offset < $end_offset)) {
            $next_offset = $end_offset;
        }
        while(abs($next_offset - $last_offset) > 1) {
            $mid_offset = ($next_offset + $last_offset) >> 1;
            $mid_doc = $postings[$mid_offset]["DOC_MAP_INDEX"];
            $cmp = ($is_ascending) ?
                ($mid_doc < $doc_offset) : ($mid_doc > $doc_offset);
            if ($cmp) {
                $last_offset = $mid_offset;
            } else {
                $next_offset = $mid_offset;
                $next_doc = $mid_doc;
            }
        }
        if (abs($next_offset - $last_offset) == 1) {
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"];
        }
        return [$next_offset, $next_doc];
    }
    /**
     * Forwards the iterator one group of docs. This is what's called
     * by @see advance($gen_doc_offset) if $gen_doc_offset is null
     */
    public function plainAdvance()
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $this->advanceSeenDocs();
        $this->current_doc_offset = null;
        $update_check = ($is_ascending) ?
            ($this->current_offset < $this->next_offset) :
            ($this->current_offset > $this->next_offset);
        if ($update_check) {
            $this->current_offset = $this->next_offset;
            $update_check = ($is_ascending) ?
                ($this->current_offset > $this->last_offset) :
                ($this->current_offset < $this->start_offset);
            if ($update_check) {
                $this->advanceGeneration();
                $this->next_offset = $this->current_offset;
            }
        } else {
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
    }
    /**
     * Switches which index shard is being used to return occurrences of
     * the word to the next shard containing the word
     *
     * @param int $generation generation to advance beyond
     */
    public function advanceGeneration($generation = null)
    {
        if ($generation === null) {
            $generation = $this->current_generation;
        }
        $this->generation_pointer ??= 0; //if not set set to 0
        $is_ascending = ($this->direction == self::ASCENDING);
        do {
            $gen_check = ($is_ascending) ?
                ($this->generation_pointer < $this->num_generations) :
                ($this->generation_pointer >= 0);
            if ($gen_check) {
                if ($is_ascending) {
                    $this->generation_pointer++;
                } else {
                    $this->generation_pointer--;
                }
            }
            $gen_check = ($is_ascending) ?
                $this->generation_pointer < $this->num_generations :
                $this->generation_pointer >= 0;
            if ($gen_check) {
                if ($this->index_version < 3) {
                    list($this->current_generation, $this->start_offset,
                        $this->last_offset, )
                        = $this->dictionary_info[$this->generation_pointer];
                } else {
                    $partition_info =
                        $this->dictionary_info[$this->generation_pointer];
                    $this->current_generation = $partition_info['PARTITION'];
                    $this->start_offset = 0;
                    $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1;
                }
                $this->current_offset = ($is_ascending) ? $this->start_offset:
                    $this->last_offset;
            }
            if (!$this->no_more_generations) {
                $gen_check = ($is_ascending) ?
                    ($this->current_generation < $generation &&
                    $this->generation_pointer >= $this->num_generations) :
                    ($this->current_generation > $generation &&
                    $this->generation_pointer <= 0);
                if ($gen_check) {
                    $index_info = IndexManager::getWordInfo($this->index_name,
                        $this->word_key, 0, $this->num_generations,
                        C\NUM_DISTINCT_GENERATIONS, true);
                    if ($this->index_version < 3) {
                        list($estimated_remaining_total, $info) = $index_info;
                    } else {
                        $estimated_remaining_total = $index_info['TOTAL_COUNT'];
                        $info = $index_info["ROWS"];
                    }
                    if (count($info) > 0) {
                        $this->num_docs = $this->seen_docs +
                            $estimated_remaining_total;
                        ksort($info);
                        $this->dictionary_info = array_merge(
                            $this->dictionary_info, array_values($info));
                        $this->num_generations = count($this->dictionary_info);
                        $this->no_more_generations =
                            count($info) < C\NUM_DISTINCT_GENERATIONS;
                        //will increment back to where were next loop
                        if ($is_ascending) {
                            $this->generation_pointer--;
                        } else {
                            $this->generation_pointer++;
                        }
                    }
                }
            }
            $gen_check = ($is_ascending) ?
                ($this->current_generation < $generation &&
                $this->generation_pointer < $this->num_generations) :
                ($this->current_generation > $generation &&
                $this->generation_pointer >= 0);
        } while($gen_check);
    }
    /**
     * Given a partition number in the the index's PartitionDocumentBundle
     * retrieves all the posting for the word iterator's term in that
     * partition.
     *
     * @param int $generation partition to get  postings for
     * @return array of posting items
     */
    public function getGenerationPostings($generation)
    {
        static $test_time = 0;
        if ($this->index_version < 3 ||
            empty($this->dictionary_info[$generation])) {
            return [];
        }
        $generation_info = $this->dictionary_info[$generation];
        if (!empty($generation_info['POSTINGS']) &&
            is_array($generation_info['POSTINGS'])) {
            return $generation_info['POSTINGS']; //already loaded
        }
        $index = IndexManager::getIndex($this->index_name);
        if ($this->index_version < "3.2") {
            if (empty($generation_info['LAST_BLOB_LEN'])) {
                $postings_entry = "";
            } else {
                $postings_entry = $index->dictionary->getArchive(
                    $this->archive_file, $generation_info['POSTINGS'],
                    $generation_info['LAST_BLOB_LEN']);
                unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']);
            }
        } else {
            if (empty($generation_info['POSTINGS_OFFSET']) ||
                empty($generation_info['POSTINGS_LEN'])) {
                $postings_entry = "";
            } else {
                $postings_entry = $index->getPostingsString($generation,
                    $generation_info['POSTINGS_OFFSET'],
                    $generation_info['POSTINGS_LEN']);
            }
        }
        if (empty($postings_entry)) {
            $postings = [];
        } else {
            list($postings,) = $index->unpackPostings($postings_entry);
        }
        $this->dictionary_info[$generation]['POSTINGS'] = $postings;
        return $postings;
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be return by this iterator
     *
     * @return mixed an array with the desired document offset
     * and generation; -1 on fail
     */
    public function currentGenDocOffsetWithWord()
    {
        if ($this->current_doc_offset !== null) {
            return [$this->current_generation, $this->current_doc_offset];
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $offset_check = ($is_ascending) ?
            ($this->current_offset > $this->last_offset ||
            $this->generation_pointer >= $this->num_generations) :
            ($this->current_offset < $this->start_offset||
            $this->generation_pointer < -1);
        if ($offset_check) {
            return -1;
        }
        if ($this->index_version < 3) {
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            $this->current_doc_offset = $index->getCurrentShard(
                )->docOffsetFromPostingOffset($this->current_offset);
        } else if (empty($this->dictionary_info[$this->generation_pointer])){
            return -1;
        } else {
            $partition_info = $this->dictionary_info[$this->generation_pointer];
            $this->current_generation = $partition_info['PARTITION'];
            $postings = $this->getGenerationPostings($this->generation_pointer);
            $this->current_doc_offset =
                $postings[$this->current_offset]['DOC_MAP_INDEX'] ?? -1;
        }
        return [$this->current_generation, $this->current_doc_offset];
    }
}
ViewGit