Last commit for src/library/IndexDocumentBundle.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2021  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2021
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog, crawlHash, and garbageCollect
 */
require_once __DIR__ . '/Utility.php';
/**
 * Encapsulates a set of web page documents and an inverted word-index of terms
 * from these documents which allow one to search for documents containing a
 * particular word.
 *
 * @author Chris Pollett
 */
class IndexDocumentBundle implements CrawlConstants
{
    /**
     * File name used to store, within the folder of the IndexDocumentBundle,
     * parameter/configuration information about the bundle
     */
    const ARCHIVE_INFO_FILE = "archive_info.txt";
    /**
     * The version of this IndexDocumentBundle. The lowest format number is
     * 3.0, as prior inverted index/document stores used IndexArchiveBundles
     */
    const DEFAULT_VERSION = "3.0";
    /**
     * Default values for the configuration parameters of an
     * IndexDocumentBundle
     */
    const DEFAULT_PARAMETERS = ["DESCRIPTION" => "",
        "VERSION" => self::DEFAULT_VERSION
    ];
    /**
     * Subfolder of IndexDocumentBundle to store the btree with
     * term => posting list information (i.e., the inverted index)
     */
    const DICTIONARY_FOLDER = "dictionary";
    /**
     * DocIds are made of three parts: hash of url, hash of document, hash
     * of url hostname. Each of these hashes is DOCID_PART_LEN bytes long
     */
    const DOCID_PART_LEN = 8;
    /**
     * Length of DocIds used by this IndexDocumentBundle
     */
    const DOCID_LEN = 24;
    /**
     * Partition i in an IndexDocumentBundle has a subfolder i
     * within self::POSITIONS_DOC_MAP_FOLDER. Within this subfolder i,
     * self::DOC_MAP_FILENAME is the name of the file used to store the
     * document map for the partition. The document map consists of a sequence
     * of records associated with each doc_id of a document stored in the
     * partition. The first record is ["POS" => $num_words,
     * "SCORE" => floatval($global_score_for_document)]. The second record is:
     * ["POS" => $length_of_title_of_document, "SCORE" =>
     *          floatval($num_description_scores)]]
     * Here a description score is a score for the importance for a section
     * of a document. Subsequence records, list [POS => the length of the jth
     * section of the document, SCORE => its score].
     */
    const DOC_MAP_FILENAME = "doc_map";
    /**
     * Folder used to store the partition data of this IndexDocumentBundle.
     * This consists of .txt.gz files for each partition, which are used
     * to store summaries of documents and actual documents (web pages), and
     * .ix files, which are used to store doc_ids and the associated offsets
     * to their summary and actual document within the .txt.gz file
     */
    const DOCUMENTS_FOLDER = "documents";
    /**
     * Name of the last entries file used to help compute difference lists
     * for doc_index, and position list offsets used in postings for the
     * partition. This file is also used to track the total number of
     * occurrences of a term in a partition
     */
    const LAST_ENTRIES_FILENAME = "last_entries";
    /**
     * The filename of a file that is used to keep track of the integer
     * index of the next partition whose documents can be added to this
     * IndexDocumentBundle's dictionary. It should always be the case that
     * next_partition <= save_partition
     */
    const NEXT_PARTITION_FILE = "next_partition.txt";
    /**
     * Names for the files which appear within a partition sub-folder
     */
    const PARTITION_FILENAMES = [self::DOC_MAP_FILENAME,
        self::LAST_ENTRIES_FILENAME, self::POSITIONS_FILENAME,
        self::POSTINGS_FILENAME];
    /**
     * Name of the file within a partition's positions_doc_maps folder used
     * to contain the partition's position lists for all terms in the
     * partition.
     */
    const POSITIONS_FILENAME = "positions";
    /**
     * Name of the file within a partition's positions_doc_maps folder with
     * posting information for all terms in that partition. This consists of
     * key value pairs term_id => posting records for all documents with that
     * term.
     */
    const POSTINGS_FILENAME = "postings";
    /**
     * Name of the folder used to hold position lists and document maps. Within
     * this folder there is a subfolder for each partition which contains a
     * doc_map file, postings file for the docs within the partition,
     * position lists file for those postings, and a last_entries file
     * used in the computation of difference lists for doc_index and position
     * list offsets, as well as the number of occurrences of terms.
     */
    const POSITIONS_DOC_MAP_FOLDER = "positions_doc_maps";
    /**
     * Folder name to use for this IndexDocumentBundle
     * @var string
     */
    public $dir_name;
    /**
     * A short text name for this IndexDocumentBundle
     * @var string
     */
    public $description;
    /**
     * Index of the next partition whose postings still need to be added to
     * the dictionary of this bundle
     * @var int
     */
    public $next_partition_to_add;
    /**
     * PartitionDocumentBundle for web page documents
     * @var object
     */
    public $documents;
    /**
     * BPlusTree used as the dictionary (inverted index) of this bundle.
     * It contains rows of the form (TERM, PARTITION, NUM_DOCS,
     * NUM_OCCURRENCES, POSTINGS) for each partition that contains a given
     * term
     * @var object
     */
    public $dictionary;
    /**
     * Makes or initializes an IndexDocumentBundle with the provided parameters
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *  IndexDocumentBundle
     * @param int $num_docs_per_partition the number of documents to be stored
     *  in a single partition
     * @param int $max_keys the maximum number of keys used by the BPlusTree
     *  used for the inverted index
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null, $num_docs_per_partition =
        C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS)
    {
        $this->dir_name = $dir_name;
        $is_dir = is_dir($this->dir_name);
        if (!$is_dir && !$read_only_archive) {
            mkdir($this->dir_name);
            mkdir($this->dir_name . "/". self::POSITIONS_DOC_MAP_FOLDER);
        } else if (!$is_dir) {
            return false;
        }
        $archive_info_path = $this->dir_name . "/" . self::ARCHIVE_INFO_FILE;
        $this->archive_info = self::DEFAULT_PARAMETERS;
        if (!empty($description)) {
            $this->archive_info["DESCRIPTION"] = $description;
        }
        if(file_exists($archive_info_path)) {
            $this->archive_info = unserialize(file_get_contents(
                $archive_info_path));
        } else if (!$read_only_archive) {
            file_put_contents($archive_info_path,
                serialize($this->archive_info));
        }
        $next_partition_path = $this->dir_name . "/".
            self::NEXT_PARTITION_FILE;
        if (file_exists($next_partition_path)) {
            $this->next_partition_to_add = intval(
                file_get_contents($next_partition_path));
        } else if (!$read_only_archive) {
            $this->next_partition_to_add = 0;
            file_put_contents($next_partition_path,
                $this->next_partition_to_add);
        }
        $this->documents = new PartitionDocumentBundle($dir_name . "/" .
            self::DOCUMENTS_FOLDER, ["PRIMARY KEY" => [self::DOC_ID,
            self::DOCID_LEN],
            self::SUMMARY => "SERIAL", self::PAGE => "SERIAL"],
            $num_docs_per_partition,
            PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD,
            C\NS_COMPRESSORS . "GzipCompressor");
        if (!$read_only_archive) {
            $this->documents->index_cache_size = 1;
        }
        $this->doc_map_tools = new PackedTableTools([
            "PRIMARY KEY" => ["DOC_KEYS", 24], "POS" => "INT",
            "SCORE" => "FLOAT"], C\NS_COMPRESSORS . "GzipCompressor");
        $this->postings_tools = new PackedTableTools([
            "PRIMARY KEY" => ["TERM", 16], "DOC_INDEX" => "INT",
                "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT",
                "POSITIONS_LEN" => "INT"], C\NS_COMPRESSORS .
                "GzipCompressor");
        $this->last_entries_tools = new PackedTableTools([
            "PRIMARY KEY" => ["TERM", 16], "LAST_INDEX" => "INT",
            "LAST_OFFSET" => "INT", "NUM_OCCURRENCES" => "INT"],
            C\NS_COMPRESSORS . "GzipCompressor");
        if (!$read_only_archive) {
            $this->documents->initCountIfNotExists("VISITED_URLS_COUNT");
        }
        $this->dictionary = new BPlusTree($this->dir_name . "/" .
            self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16],
            "PARTITION" => "INT", "NUM_DOCS" => "INT",
            "NUM_OCCURRENCES" => "INT", "POSTINGS" => "BLOB"], $max_keys,
            C\NS_COMPRESSORS . "GzipCompressor");
    }
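    /*
     * Construction sketch (illustrative only: the cache directory name and
     * description below are assumptions, not values used elsewhere in this
     * file). Passing false for $read_only_archive creates a writable bundle;
     * query-time code would typically use the read-only default:
     *
     *     $bundle = new IndexDocumentBundle(C\CRAWL_DIR .
     *         "/cache/IndexDataExample", false, "example crawl");
     *     // later, a read-only handle to the same bundle
     *     $read_bundle = new IndexDocumentBundle(C\CRAWL_DIR .
     *         "/cache/IndexDataExample");
     */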
    /**
     * Add the array of $pages to the documents PartitionDocumentBundle
     *
     * @param array $pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *     (visited urls is a smaller number than the total count of objects
     *     stored in the index).
     * @return bool success or failure of adding the pages
     */
    public function addPages($pages, $visited_urls_count)
    {
        crawlLog("Indexer adding pages to document bundle...");
        $success = $this->documents->put($pages);
        $this->documents->addCount($visited_urls_count,
            "VISITED_URLS_COUNT");
        return $success;
    }
    /**
     * For every partition between next partition and save partition, adds
     * the posting list information to the dictionary BPlusTree. At the
     * end of this process next partition and save partition should be the same
     *
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the crawl_status.txt file to prevent
     *  Yioop's web interface from stopping the crawl because it has seen
     *  no recent progress activity on a crawl.
     * @return bool true if at least one partition was added to the dictionary
     */
    public function updateDictionary($taking_too_long_touch = null)
    {
        $next_partition = $this->next_partition_to_add;
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        $current_num_docs = $this->documents->parameters['ACTIVE_COUNT'];
        crawlLog("Current save partition has $current_num_docs documents.");
        $memory_limit = metricToInt(ini_get("memory_limit"));
        $before_usage = memory_get_usage();
        crawlLog("Indexer Memory  limit is $memory_limit. Usage is " .
            $before_usage);
        $advanced_partition = false;
        while ($next_partition < $save_partition) {
            crawlLog("Indexer adding Partition to dictionary...");
            crawlLog("...because save partition changed");
            $switch_time = microtime(true);
            // Add this partition's postings to the main dictionary
            measureCall($this, "buildInvertedIndexPartition",
                [$next_partition, $taking_too_long_touch]);
            $num_freed = garbageCollect();
            $this->addPartitionPostingsDictionary(
                $next_partition, $taking_too_long_touch);
            crawlLog("Indexer force running garbage collector after partition".
                 " advance. This freed $num_freed bytes.");
            $after_usage = memory_get_usage();
            crawlLog(
                "Indexer after partition changed memory usage: $after_usage");
            crawlLog("Switch Partition time:".
                changeInMicrotime($switch_time));
            $next_partition++;
            file_put_contents($this->dir_name . "/". self::NEXT_PARTITION_FILE,
                $next_partition);
            $advanced_partition = true;
        }
        $this->next_partition_to_add = $next_partition;
        return $advanced_partition;
    }
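    /*
     * Sketch of a typical indexing cycle (the $pages array is assumed to be
     * in the summary/page record format produced elsewhere by Yioop's
     * fetchers, and the touch-file path follows the crawl_status.txt
     * convention described above; both are assumptions here):
     *
     *     $bundle->addPages($pages, $num_visited);
     *     if ($bundle->updateDictionary(C\CRAWL_DIR .
     *         "/schedules/crawl_status.txt")) {
     *         // at least one completed partition's postings were merged
     *         // into the dictionary BPlusTree
     *     }
     */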
    /**
     * Adds the previously constructed inverted index of partition $partition
     * to the inverted index of the whole bundle
     *
     * @param int $partition which partition's inverted index to add; by
     *  default, the partition just before the current save partition
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the crawl_status.txt file to prevent
     *  Yioop's web interface from stopping the crawl because it has seen
     *  no recent  progress activity on a crawl.
     */
    public function addPartitionPostingsDictionary($partition = -1,
        $taking_too_long_touch = null)
    {
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        if ($partition < 0 ) {
            if ($save_partition <= 0) {
                return false;
            }
            $partition = $save_partition - 1;
        }
        $base_folder = $this->getPartitionBaseFolder($partition);
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        $dictionary = $this->dictionary;
        $postings_filename = $base_folder . "/" . self::POSTINGS_FILENAME;
        $last_entries_filename = $base_folder . "/" .
            self::LAST_ENTRIES_FILENAME;
        if (!file_exists($postings_filename)) {
            crawlLog($postings_filename);
            crawlLog("Postings file for partition $partition does not exist");
            return false;
        }
        if (!file_exists($last_entries_filename)) {
            crawlLog(
                "Last entries file for partition $partition does not exist");
            return false;
        }
        crawlLog("Start Adding Partition Posting Info to Dictionary");
        $start_time = microtime(true);
        $postings = $postings_tools->load($postings_filename);
        $last_entries = $last_entries_tools->load($last_entries_filename);
        $num_postings = count($postings);
        $i = 0;
        foreach ($postings as $term => $entry) {
            if(crawlTimeoutLog("..Indexer Still processing partition ".
                "$partition. Have completed $i postings of $num_postings.") &&
                $taking_too_long_touch) {
                if (file_exists($taking_too_long_touch)) {
                    touch($taking_too_long_touch, time());
                }
            }
            $start = 0;
            $num_docs_term = vByteDecode($entry, $start);
            $num_occurrences_term = 0;
            $last_entry = $last_entries_tools->find($last_entries, $term);
            if (!empty($last_entry)) {
                $last_entry_row =
                    $last_entries_tools->unpack($last_entry);
                $num_occurrences_term = $last_entry_row[0]["NUM_OCCURRENCES"];
            }
            $dictionary->put(["TERM" => $term, "PARTITION" => $partition,
                "NUM_DOCS" => $num_docs_term,
                "NUM_OCCURRENCES"  => $num_occurrences_term,
                "POSTINGS" => $entry]);
            $i++;
        }
        $dictionary->flushLastPutNode();
        crawlLog("...Finished Adding Partition Posting Info to " .
            "Dictionary: " . changeInMicrotime($start_time));
        if (!C\nsdefined("KEEP_PARTITION_CALCULATIONS") ||
            !C\KEEP_PARTITION_CALCULATIONS) {
            crawlLog("Deleting partition posting calculations..");
            if (file_exists($postings_filename)) {
                unlink($postings_filename);
            }
            if (file_exists($last_entries_filename)) {
                unlink($last_entries_filename);
            }
            crawlLog("..Done deleting partition posting calculations.");
        }
    }
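    /*
     * Sketch of adding one partition's postings to the dictionary by hand
     * (the partition number is illustrative). buildInvertedIndexPartition()
     * must run first so the partition's postings and last_entries files
     * exist:
     *
     *     $bundle->buildInvertedIndexPartition(2);
     *     $bundle->addPartitionPostingsDictionary(2);
     */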
    /**
     * Gets the file path corresponding to the partition with index $partition
     *
     * @param int $partition desired partition index
     * @return string file path to where this partitions index data is stored
     *  (Not the original documents which are stored in the
     *  PartitionDocumentBundle)
     */
    public function getPartitionBaseFolder($partition)
    {
        $base_folder = $this->dir_name . "/" . self::POSITIONS_DOC_MAP_FOLDER
            . "/$partition";
        return $base_folder;
    }
    /**
     * Given the $doc_id of a document and a $partition to look for it in,
     * returns the document summary info if present and [] otherwise.
     *
     * @param string $doc_id of document to look up
     * @param int $partition to look for document in
     * @return array desired summary or [] if look up failed
     */
    public function getSummary($doc_id, $partition)
    {
        $row = $this->documents->get($doc_id, $partition, [self::SUMMARY]);
        return $row[self::SUMMARY] ?? [];
    }
    /**
     * Given the $doc_id of a document and a $partition to look for it in,
     * returns the cached page of the document if present and [] otherwise
     *
     * @param string $doc_id of document to look up
     * @param int $partition to look for document in
     * @return array desired page cache or [] if look up failed
     */
    public function getCachePage($doc_id, $partition)
    {
        $row = $this->documents->get($doc_id, $partition, [self::PAGE]);
        return $row[self::PAGE] ?? [];
    }
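    /*
     * Retrieval sketch (assumes $doc_id and $partition were obtained from a
     * dictionary/posting lookup such as getWordInfo(), and that $bundle is a
     * constructed IndexDocumentBundle):
     *
     *     $summary = $bundle->getSummary($doc_id, $partition);
     *     $title = $summary[CrawlConstants::TITLE] ?? "";
     *     $cache_page = $bundle->getCachePage($doc_id, $partition);
     */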
    /**
     * Builds an inverted index shard for a documents PartitionDocumentBundle
     * partition.
     * @param int $partition to build the index for; if negative, the current
     *  save partition is used
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the crawl_status.txt file to prevent
     *  Yioop's web interface from stopping the crawl because it has seen
     *  no recent progress activity on a crawl.
     * @param bool $just_stats if true, don't write any partition index files;
     *  instead, return an array of statistics about the partition
     * @return mixed whether the job executed to completion (true or false) if
     *  !$just_stats; otherwise, an array with NUM_DOCS, NUM_LINKS,
     *  and TERM_STATISTICS (the latter having term frequency info)
     */
    public function buildInvertedIndexPartition($partition = -1,
        $taking_too_long_touch = null, $just_stats = false)
    {
        $start_time = microtime(true);
        crawlLog("  Indexer start building inverted index ...  Current Memory:".
            memory_get_usage());
        if ($partition < 0) {
            $partition = $this->documents->parameters["SAVE_PARTITION"];
        }
        crawlLog(
            "Indexer Building inverted index for partition $partition");
        $base_folder = $this->getPartitionBaseFolder($partition);
        if (!file_exists($base_folder)) {
            mkdir($base_folder);
        }
        /* set up $doc_map_filename, $postings_filename, $positions_filename,
           and $last_entries_filename based on self::PARTITION_FILENAMES
         */
        foreach (self::PARTITION_FILENAMES as $filename) {
            $component_filename = $base_folder . "/" . $filename;
            if (file_exists($component_filename)) {
                unlink($component_filename);
            }
            $component = $filename . "_filename";
            $$component = $component_filename;
        }
        $doc_map_tools = $this->doc_map_tools;
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        $doc_map = [];
        $postings = [];
        $last_entries = [];
        $positions = "";
        crawlLog("Indexer Preparing Index Map...");
        $index_map = measureCall($this, "prepareIndexMap", [$partition]);
        crawlLog("Number of documents in mapped partition:" .
            count($index_map));
        $cnt = 0;
        $non_aux_doc_cnt = 0;
        $link_cnt = 0;
        $num_partition = count($index_map);
        foreach ($index_map as $hash_url => $url_info) {
            $site = [];
            $non_aux_doc_cnt++;
            if (!empty($url_info['doc'])) {
                $site = $this->getSummary($url_info['doc'], $partition);
                if (empty($site)) {
                    continue;
                }
            }
            $max_description_len ??= C\MAX_DESCRIPTION_LEN;
            $max_description_len = (empty($site[self::DESCRIPTION])) ?
                $max_description_len : max($max_description_len,
                strlen($site[self::DESCRIPTION]));
            $metas_only = ($url_info['aux_docs'] == 'metas_only');
            $aux_sites = [];
            $aux_description = "";
            if ($metas_only) {
                $site[self::JUST_METAS] = true;
            } else {
                foreach ($url_info['aux_docs'] as $aux_doc) {
                    $aux_site = $this->getSummary($aux_doc, $partition);
                    if (empty($aux_site) || !is_array($aux_site)) {
                        continue;
                    }
                    $aux_site[self::JUST_METAS] = true;
                    $aux_sites[] = $aux_site;
                    if (!empty($aux_site[self::DESCRIPTION])) {
                        if (strlen($aux_description) +
                            strlen($aux_site[self::DESCRIPTION]) <
                            $max_description_len) {
                            $aux_description .= " .. " .
                                $aux_site[self::DESCRIPTION];
                        }
                    }
                }
                if (empty($site) && !empty($aux_sites)) {
                    $site = array_shift($aux_sites);
                    if (!is_array($site)) {
                        continue;
                    }
                    unset($site[self::JUST_METAS]);
                }
                if (!is_array($site)) {
                    continue;
                }
                $site[self::DESCRIPTION] ??= "";
                $site[self::DESCRIPTION] .= $aux_description;
            }
            array_unshift($aux_sites, $site);
            foreach ($aux_sites as $site) {
                $cnt++;
                $interim_time = microtime(true);
                if (!isset($site[self::HASH]) ||
                    (isset($site[self::ROBOT_METAS]) &&
                    in_array("JUSTFOLLOW", $site[self::ROBOT_METAS]))) {
                    continue;
                }
                // this case might occur on a recrawl
                if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                    $is_link = true;
                    $link_cnt++;
                    $site_url = $site[self::TITLE];
                    $host =  UrlParser::getHost($site_url);
                    $link_parts = explode('|', $site[self::HASH]);
                    if (isset($link_parts[5])) {
                        $link_origin = $link_parts[5];
                    } else {
                        $link_origin = $site_url;
                    }
                    $url_info = [];
                    if (!empty($site[self::LANG])) {
                        $url_info[self::LANG] = $site[self::LANG];
                    }
                    $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                        $host, $site[self::DESCRIPTION], $link_origin,
                        $url_info);
                    $link_to = "LINK TO:";
                } else {
                    $is_link = false;
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $meta_ids =  PhraseParser::calculateMetas($site, false);
                    $link_to = "";
                }
                $word_lists = [];
                $title_length = 0;
                $triplet_lists = [];
                /*
                    self::JUST_METAS check to avoid getting sitemaps in results
                    for popular words
                 */
                $lang = null;
                $is_safe = false;
                if (!isset($site[self::JUST_METAS])) {
                    $host_words = UrlParser::getWordsInHostUrl($site_url);
                    $path_words = UrlParser::getWordsLastPathPartUrl(
                        $site_url);
                    if ($is_link) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        if (isset($site[self::LANG])) {
                            if (isset($this->programming_language_extension[
                                $site[self::LANG]])) {
                                $phrase_string = $site[self::DESCRIPTION];
                            } else {
                                $phrase_string = $host_words . " ".
                                    $site[self::TITLE] . " ". $path_words .
                                    " tztzlzngth ". $site[self::DESCRIPTION];
                            }
                        } else {
                            $phrase_string = $host_words . " " .
                                $site[self::TITLE] . " ". $path_words .
                                " tztzlzngth ". $site[self::DESCRIPTION];
                        }
                    }
                    if (empty($site[self::LANG])) {
                        $lang = guessLocaleFromString(
                            $site[self::DESCRIPTION]);
                    } else {
                        $lang = $site[self::LANG];
                    }
                    $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
                        $phrase_string, $lang);
                    $word_lists = $word_and_qa_lists['WORD_LIST'];
                    if (!empty($word_lists["tztzlzngth"][0])) {
                        $title_length = $word_lists["tztzlzngth"][0] + 1;
                        unset($word_lists["tztzlzngth"]);
                    }
                    $len = strlen($phrase_string);
                    if (isset($this->programming_language_extension[$lang]) ||
                        PhraseParser::computeSafeSearchScore($word_lists, $len,
                            $site_url) < 0.012) {
                        $meta_ids[] = "safe:all";
                        $meta_ids[] = "safe:true";
                        $is_safe = true;
                    } else {
                        $meta_ids[] = "safe:all";
                        $meta_ids[] = "safe:false";
                        $is_safe = false;
                    }
                }
                $description_scores =
                    (empty($site[self::DESCRIPTION_SCORES])) ? [] :
                    $site[self::DESCRIPTION_SCORES];
                $user_ranks =
                    (empty($site[self::USER_RANKS])) ? [] :
                    $site[self::USER_RANKS];
                $doc_map_index = count($doc_map);
                $num_words = 0;
                foreach ($word_lists as $word => $position_list) {
                    $num_words += count($position_list);
                }
                $doc_id = ($url_info['doc'] ??
                    ($url_info['aux_docs'][0] ?? ""));
                if (empty($doc_id)) {
                    continue;
                }
                $this->addScoresDocMap($doc_map, $doc_id, $num_words,
                    $url_info['score'], $title_length, $description_scores,
                    $user_ranks);
                $this->addTermPostingLists($postings, $positions, 0, $num_words,
                    $word_lists, $meta_ids, $doc_map_index, $last_entries);
                $interim_elapse = changeInMicrotime($interim_time);
                if ($interim_elapse > 5) {
                    crawlLog("..Indexer Inverting " . $link_to . $site_url .
                    "...took > 5s.");
                }
                $memory_usage = memory_get_usage();
                if (crawlTimeoutLog("..Indexer Still building inverted index. ".
                    "\n....Current Indexer Memory Usage is %s.\n" .
                    "....Indexer has processed %s of %s documents.\n" .
                    "....Total links or docs processed by Indexer is %s.\n" .
                    "....Last url Indexer processed was %s.",
                    $memory_usage,
                    $non_aux_doc_cnt, $num_partition, $non_aux_doc_cnt + $cnt,
                    $link_to . $site_url) && $taking_too_long_touch) {
                    if (file_exists($taking_too_long_touch)) {
                        touch($taking_too_long_touch, time());
                    }
                }
            }
        }
        if ($just_stats) {
            $term_stats = [];
            foreach ($postings as $term => $term_postings) {
                $posting_records = $postings_tools->unpack($term_postings);
                $term_stats[$term] = count($posting_records);
            }
            $statistics = [
                "NUM_DOCS" => count($doc_map),
                "NUM_LINKS" => $link_cnt,
                "TERM_STATISTICS" => $term_stats
            ];
            return $statistics;
        }
        $doc_map_tools->save($doc_map_filename, $doc_map);
        $postings_tools->save($postings_filename, $postings);
        $last_entries_tools->save($last_entries_filename, $last_entries);
        crawlLog("  Indexer build inverted index time ".
            changeInMicrotime($start_time));
        return true;
    }
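    /*
     * Sketch of the $just_stats mode, which inspects a partition without
     * writing any index files (the partition number is illustrative):
     *
     *     $stats = $bundle->buildInvertedIndexPartition(0, null, true);
     *     // $stats["NUM_DOCS"] and $stats["NUM_LINKS"] count documents and
     *     // links in the partition; $stats["TERM_STATISTICS"] maps
     *     // term_id => number of posting records for that term
     */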
    /**
     * Given a $site array of information about a web page/document, uses the
     * CrawlConstants::URL and CrawlConstants::HASH fields to compute a
     * unique doc id for the array.
     *
     * @param array $site site to compute the doc_id for
     * @return string doc_id for the given site
     */
    public function computeDocId($site)
    {
        $doc_id = false;
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            $doc_id = $site[self::HTTP_CODE];
        } else {
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $host = UrlParser::getHost($site_url);
            $doc_id = crawlHash($site_url, true) . $site[self::HASH] .
                "d". substr(crawlHash($host . "/", true), 1);
        }
        return $doc_id;
    }
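    /*
     * Sketch of the doc_id layout this method produces for a non-link
     * document (the url and hash below are illustrative stand-ins for values
     * normally set during a crawl):
     *
     *     $site = [CrawlConstants::URL => "https://www.example.com/page",
     *         CrawlConstants::HASH => crawlHash("page content", true)];
     *     $doc_id = $bundle->computeDocId($site);
     *     // $doc_id is DOCID_LEN = 24 bytes: 8 bytes of crawlHash(url),
     *     // the 8-byte page hash, then 'd' followed by 7 bytes of the host
     *     // hash, so $bundle->isDoc($doc_id) returns true
     */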
    /**
     * Used to add a doc_id => doc_record to the current partition's
     * document map ($doc_map). A doc record records the number of words
     * in the document, an overall score for the document, the length of its
     * title, scores for each of the sections included in the summary
     * for the document, and classifier scores for each classifier that was
     * used by the crawl.
     *
     * @param array& $doc_map associative array of docid=>doc_record pairs
     *  that this method will modify
     * @param string $doc_id new document id to add a record for
     * @param int $num_words number of terms in the document associated with the
     *  doc-id
     * @param float $score overall score for the importance of this document
     * @param int $title_length length of the title portion of the document
     *  summary in terms
     * @param array $description_scores pairs of the form (length of summary
     *  portion, score for that portion)
     * @param array $user_ranks for each user defined classifier for this crawl
     *  the float score of the classifier on this document
     */
    public function addScoresDocMap(&$doc_map, $doc_id, $num_words, $score,
        $title_length, $description_scores, $user_ranks)
    {
        $doc_map_tools = $this->doc_map_tools;
        $num_description_scores = count($description_scores);
        $out_rows = [["POS" => $num_words, "SCORE" => floatval($score)],
            ["POS" => $title_length, "SCORE" =>
                floatval($num_description_scores)]];
        foreach ($description_scores as $position => $sentence_score) {
            $out_rows[] = ["POS" => $position,
                "SCORE" => floatval($sentence_score)];
        }
        foreach ($user_ranks as $user_rank) {
            $out_rows[] = ["POS" => 0, "SCORE" => floatval($user_rank)];
        }
        $entry = $doc_map_tools->pack($out_rows);
        $doc_map_tools->add($doc_map, $doc_id, $entry);
    }
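    /*
     * Worked example (all values illustrative). A call such as
     *
     *     $bundle->addScoresDocMap($doc_map, $doc_id, 250, 3.5, 7,
     *         [20 => 0.9, 65 => 0.4], []);
     *
     * appends to $doc_map a packed record sequence equivalent to:
     *
     *     [["POS" => 250, "SCORE" => 3.5], // num words, overall doc score
     *      ["POS" => 7, "SCORE" => 2.0],   // title length, # descr. scores
     *      ["POS" => 20, "SCORE" => 0.9],  // summary section, its score
     *      ["POS" => 65, "SCORE" => 0.4]]
     */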
    /**
     * Adds posting records associated to a document to the posting lists for
     * a partition.
     *
     * @param array& $postings associative array $term_id => posting list
     *  records for that term in the partition.
     * @param string& $positions a string consisting of a concatenated
     *  sequence of term position information for each document in turn and,
     *  within this, for each term in that document.
     * @param int $position_offset number of header bytes that might be used
     *  before including any position data in the file that positions will
     *  eventually be stored.
     * @param int $doc_length length of document in terms for the document
     *  for which we are adding posting data.
     * @param array $word_lists term => positions within current document of
     *  that term for the document whose posting data we are adding
     * @param array $meta_ids meta terms associated with the document we are
     *  adding. An example, meta term might be "media:news"
     * @param int $doc_map_index which document within the partition is the one
     *  we are adding. I.e., 5 would mean there were 5 earlier documents whose
     *  postings we have already added.
     * @param array& $last_entries used to keep track of previous values of
     *  posting quantities so difference lists can be computed. For example,
     *  the previous $doc_map_index and the previous position list offset. It
     *  also tracks the total number of occurrences of a term within a
     *  partition.
     */
    public function addTermPostingLists(&$postings, &$positions,
        $position_offset, $doc_length, $word_lists, $meta_ids, $doc_map_index,
        &$last_entries)
    {
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        foreach ($meta_ids as $meta_id) {
            $word_lists[$meta_id] = [];
        }
        foreach ($word_lists as $word => $position_list) {
            $term_id = canonicalTerm($word);
            $meta_prefix = substr($word, 0, 5);
            $site_meta = ($meta_prefix == "site:" || $meta_prefix == "info:");
            $occurrences = $site_meta ? $doc_length : count($position_list);
            if (!$site_meta && $occurrences > 0) {
                $encoded_position_list = encodePositionList($position_list);
                $offset = $position_offset + strlen($positions);
                $len = strlen($encoded_position_list);
                $positions .= $encoded_position_list;
            } else {
                $offset = 0;
                $len = 0;
            }
            $last_entry = $last_entries_tools->find($last_entries, $term_id);
            if (empty($last_entry)) {
                list($last_index, $last_offset, $num_occurrences) = [0, 0, 0];
            } else {
                $last_entry_row = $last_entries_tools->unpack($last_entry);
                list($last_index, $last_offset, $num_occurrences) =
                    array_values($last_entry_row[0]);
            }
            $diff_doc_map_index = $doc_map_index - $last_index;
            $diff_offset = (!$site_meta && $occurrences > 0) ?
                $offset - $last_offset : 0;
            $entry = $postings_tools->pack(["DOC_INDEX" => $diff_doc_map_index,
                "FREQUENCY" => $occurrences, "POSITIONS_OFFSET" => $diff_offset,
                "POSITIONS_LEN" => $len]);
            $postings_tools->add($postings, $term_id, $entry,
                PackedTableTools::ADD_MEM_TABLE, PackedTableTools::APPEND_MODE);
            $add_entry = $last_entries_tools->pack(
                ["LAST_INDEX" => $doc_map_index, "LAST_OFFSET" => $offset,
                "NUM_OCCURRENCES" => $num_occurrences + $occurrences]);
            $last_entries_tools->add($last_entries, $term_id, $add_entry);
        }
    }
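    /*
     * Delta-coding sketch (values illustrative): suppose the term id for
     * "apple" already has LAST_INDEX = 3 and LAST_OFFSET = 100 in
     * $last_entries, and the term now occurs twice in the document with
     * $doc_map_index = 7, its encoded position list being written at offset
     * 160 in $positions. The posting packed here is then:
     *
     *     ["DOC_INDEX" => 4,          // 7 - 3
     *      "FREQUENCY" => 2,
     *      "POSITIONS_OFFSET" => 60,  // 160 - 100
     *      "POSITIONS_LEN" => strlen($encoded_position_list)]
     *
     * and the term's $last_entries record becomes LAST_INDEX = 7,
     * LAST_OFFSET = 160, with NUM_OCCURRENCES increased by 2.
     */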
    /**
     * Checks if a doc_id corresponds to a document or a link
     *
     * @param string $key to check if doc or not
     * @return bool true if a document
     */
    public function isDoc($key)
    {
        return $key[self::DOCID_PART_LEN << 1] == 'd';
    }
    /**
     * As a pre-step to calculating the inverted index information for a
     * partition, this method groups documents and links to documents into
     * single objects.
     * It also does simple deduplication of documents that have the same hash.
     * It then returns an array of the grouped document data.
     *
     * @param int $partition index of partition to do deduplication for
     *  in the case that test index is empty
     * @param array $test_index is non-empty only when testing what
     *  this method does. In that case, it should consist of an array
     *  of $doc_id => string pairs representing a possible record for each
     *  doc. As deduplication is done entirely based on the components of the
     *  doc_id (hash_url, hash_doc, doc_type, hash_host) the string doesn't
     *  matter too much.
     * @return array of hash_url => grouped records for the documents and
     *  links associated with that url
     */
    public function prepareIndexMap($partition, $test_index = [])
    {
        if (empty($test_index)) {
            $doc_index = $this->documents->loadPartitionIndex($partition, true);
        } else {
            $doc_index = $test_index;
        }
        $doc_ids = array_keys($doc_index);
        $num_ids = count($doc_ids);
        $grouped_urls = [];
        $grouped_hashes = [];
        $score = $num_ids;
        $doc_key_len = self::DOCID_PART_LEN;
        foreach ($doc_ids as $doc_id) {
            $hash_url = substr($doc_id, 0, $doc_key_len);
            if (empty($grouped_urls[$hash_url])) {
                $grouped_urls[$hash_url] = ["aux_docs" => [], 'score' => 0];
            }
            if ($this->isDoc($doc_id)) {
                $hash_code = substr($doc_id, $doc_key_len, $doc_key_len);
                if (empty($grouped_hashes[$hash_code])) {
                    $grouped_hashes[$hash_code] = [];
                }
                $grouped_hashes[$hash_code][] = $hash_url;
                $grouped_urls[$hash_url]['doc'] = $doc_id;
            } else {
                $grouped_urls[$hash_url]['aux_docs'][] = $doc_id;
            }
            $grouped_urls[$hash_url]['score'] += $score;
            $score--;
        }
        foreach ($grouped_hashes as $same_hash_group) {
            if (count($same_hash_group) > 1) {
                $max_score = 0;
                $max_url = "";
                foreach ($same_hash_group as $hash_url) {
                    if ($grouped_urls[$hash_url]['score'] > $max_score) {
                        $max_score = $grouped_urls[$hash_url]['score'];
                        $max_url = $hash_url;
                    }
                }
                foreach ($same_hash_group as $hash_url) {
                    if ($hash_url != $max_url) {
                        $grouped_urls[$max_url]['score'] +=
                            $grouped_urls[$hash_url]['score'];
                        if ($grouped_urls[$max_url]['aux_docs'] !=
                            "metas_only") {
                            $grouped_urls[$max_url]['aux_docs'][] =
                                $grouped_urls[$hash_url]['doc'];
                            $grouped_urls[$max_url]['aux_docs'] = array_merge(
                                $grouped_urls[$max_url]['aux_docs'],
                                $grouped_urls[$hash_url]['aux_docs']);
                        }
                        $grouped_urls[$hash_url]['aux_docs'] = "metas_only";
                    }
                }
            }
        }
        uasort($grouped_urls, function ($a, $b) {
            return intval($b['score'] - $a['score']);
        });
        return $grouped_urls;
    }
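    /*
     * Testing sketch using the $test_index hook (the ids below are readable
     * stand-ins for 24-byte doc_ids; a 'd' at position 16 marks a document
     * rather than a link):
     *
     *     $test_index = [
     *         "urlAAAAAhashAAAAdhostAAA" => "document for url A",
     *         "urlBBBBBhashAAAAdhostBBB" => "duplicate content of url A",
     *         "urlAAAAAlinkXXXXXhostCCC" => "a link pointing at url A",
     *     ];
     *     $index_map = $bundle->prepareIndexMap(0, $test_index);
     *     // $index_map is keyed by the first 8 bytes of each doc_id (the
     *     // hash of the url); the entry for "urlBBBBB" ends up with
     *     // 'aux_docs' == "metas_only" since its document hash duplicates
     *     // url A's and url A has the higher score
     */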
    /**
     * Forces the inverted index of the current save partition to be built
     * and saved
     */
    public function forceSave()
    {
        $this->buildInvertedIndexPartition();
    }
    /**
     * Used when a crawl stops to perform final dictionary operations
     * to produce a working stand-alone index.
     */
    public function stopIndexing()
    {
        $this->forceSave();
    }
    /**
     * Gets an array of posting list information for the term with id
     * $term_id, one row for each partition of this bundle in which the
     * term appears
     *
     * @param string $term_id id of phrase or word to look up in the bundle's
     *     dictionary
     * @param int $threshold after the number of documents found exceeds this
     *     amount, stop looking for more dictionary entries.
     * @param int $offset partition in the index to start finding
     *     occurrences of the term from
     * @param int $num_partitions from $offset, how many partitions to
     *     search forward in
     * @param bool $with_remaining_total whether to total the number of
     *     postings found as well or not
     * @return array with a ROWS field of posting information for each
     *     partition containing the term (each row has PARTITION, NUM_DOCS,
     *     NUM_OCCURRENCES, and POSTINGS data), together with totals such as
     *     TOTAL_COUNT, TOTAL_OCCURRENCES, and TOTAL_NUM_DOCS
     */
    public function getWordInfo($term_id, $threshold = -1,
        $offset = 0, $num_partitions = -1,
        $with_remaining_total = false)
    {
        $dictionary = $this->dictionary ?? [];
        if (!$dictionary) {
            return [];
        }
        $result = $dictionary->get($term_id, true, true, false, $offset,
            $num_partitions);
        if (empty($result)) {
            $result = [];
        }
        $max_found_partition = 0;
        $doc_count = 0;
        $occurrence_count = 0;
        $num_rows = 0;
        $threshold_met = false;
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        if (empty($result['ROWS'])) {
            $result['ROWS'] = [];
        }
        foreach ($result['ROWS'] as $row) {
            if ($threshold > 0 && $doc_count > $threshold) {
                $result['ROWS'] = array_slice($result['ROWS'], 0, $num_rows);
                $threshold_met = true;
                break;
            }
            $max_found_partition = ($max_found_partition < $row['PARTITION']) ?
                $row['PARTITION'] : $max_found_partition;
            $doc_count += $row['NUM_DOCS'];
            $occurrence_count += $row['NUM_OCCURRENCES'];
            $num_rows++;
        }
        $parameters = $this->documents->parameters;
        $result['TOTAL_NUM_DOCS'] = $parameters["VISITED_URLS_COUNT"];
        $result['TOTAL_NUM_LINKS_AND_DOCS'] = $parameters["ACTIVE_COUNT"] +
            $parameters["COUNT"];
        $result['MAX_ITEMS_PER_PARTITION'] = $parameters["MAX_ITEMS_PER_FILE"];
        $result['TOTAL_NUMBER_OF_PARTITIONS'] = $parameters["SAVE_PARTITION"]
            + 1;
        if ($threshold_met) {
            $fraction_seen = ($save_partition - $offset) /
                ($max_found_partition - $offset);
            $result['TOTAL_COUNT'] = $fraction_seen * $doc_count;
            $result['TOTAL_OCCURRENCES'] = $fraction_seen * $occurrence_count;
            $result['THESHOLD_EXCEEDED'] = true;
            return $result;
        }
        $base_folder = $this->getPartitionBaseFolder($save_partition);
        $postings_filename = $base_folder . "/" . self::POSTINGS_FILENAME;
        $postings_tool = $this->postings_tools;
        if (file_exists($postings_filename)) {
            $active_dictionary = $postings_tool->load($postings_filename);
            $active_postings_entry = $postings_tool->find($active_dictionary,
                $term_id);
            $active_postings = (empty($active_postings_entry)) ? [] :
                $postings_tool->unpack($active_postings_entry);
        }
        if (!empty($active_postings)) {
            $row = ["PARTITION" => $save_partition,
                "NUM_DOCS" => count($active_postings),
                "POSTINGS" => $active_postings];
            $doc_count += $row["NUM_DOCS"];
            $active_occurrences = $this->deDeltaPostingsSumFrequencies(
                $row["POSTINGS"]);
            $row['NUM_OCCURRENCES'] = $active_occurrences;
            $occurrence_count += $active_occurrences;
            $result['ROWS'][] = $row;
        }
        $result['TOTAL_COUNT'] = $doc_count;
        $result['TOTAL_OCCURRENCES'] = $occurrence_count;
        return $result;
    }
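    /*
     * Query-time lookup sketch (the query term is illustrative;
     * canonicalTerm() is the same helper used when the postings were built):
     *
     *     $term_id = canonicalTerm("apple");
     *     $info = $bundle->getWordInfo($term_id, 1000);
     *     foreach ($info['ROWS'] ?? [] as $row) {
     *         // each $row has PARTITION, NUM_DOCS, NUM_OCCURRENCES, and
     *         // POSTINGS data for the term in that partition
     *     }
     *     // $info['TOTAL_COUNT'] estimates how many docs contain the term
     */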
    /**
     * Within postings, DOC_INDEX and POSITIONS_OFFSET values are stored as
     * delta lists (differences over previous values). This method undoes the
     * delta coding to restore the actual DOC_INDEX and POSITIONS_OFFSET
     * values. It also computes the sum of the frequencies of the term
     * occurrences within the list of postings.
     *
     * @param array& $postings a reference to an array of posting lists for a
     *  term (this will be changed by this method)
     * @return int sum of the frequencies of term occurrences as given by the
     *  above postings
     */
    public function deDeltaPostingsSumFrequencies(&$postings)
    {
        if (empty($postings) || !is_array($postings)) {
            return 0;
        }
        $sum_frequencies = $postings[0]["FREQUENCY"];
        $doc_index = $postings[0]["DOC_INDEX"];
        $positions_offset = $postings[0]["POSITIONS_OFFSET"];
        $num_postings = count($postings);
        for ($i = 1; $i < $num_postings; $i++) {
            $sum_frequencies += $postings[$i]["FREQUENCY"];
            $doc_index += $postings[$i]["DOC_INDEX"];
            $positions_offset += $postings[$i]["POSITIONS_OFFSET"];
            $postings[$i]["DOC_INDEX"] = $doc_index;
            $postings[$i]["POSITIONS_OFFSET"] = $positions_offset;
        }
        return $sum_frequencies;
    }
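    /*
     * Worked example of the delta decoding (postings values illustrative):
     *
     *     $postings = [
     *         ["DOC_INDEX" => 2, "FREQUENCY" => 3, "POSITIONS_OFFSET" => 10,
     *             "POSITIONS_LEN" => 4],
     *         ["DOC_INDEX" => 5, "FREQUENCY" => 1, "POSITIONS_OFFSET" => 30,
     *             "POSITIONS_LEN" => 2],
     *     ];
     *     $sum = $bundle->deDeltaPostingsSumFrequencies($postings);
     *     // $sum == 4; the second posting now has DOC_INDEX == 7 and
     *     // POSITIONS_OFFSET == 40
     */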
    /**
     * Gets the description, count of documents, and number of partitions of
     * the document store in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive for
     * the sole purpose of allowing conversions of downloaded data such as arc
     * files into Yioop! format.
     *
     * @param string $dir_name path to a directory containing a documents
     *      IndexDocumentBundle
     * @return array summary of the given archive
     */
    public static function getArchiveInfo($dir_name)
    {
        if (file_exists($dir_name . "/arc_description.txt")) {
            $crawl = [];
            $info = [];
            $crawl['DESCRIPTION'] = substr(
                file_get_contents($dir_name . "/arc_description.txt"), 0, 256);
            $crawl['ARCFILE'] = true;
            $info['VISITED_URLS_COUNT'] = 0;
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = 0;
            $info['WRITE_PARTITION'] = 0;
            $info["VERSION"] = self::DEFAULT_VERSION;
            $info['DESCRIPTION'] = serialize($crawl);
            return $info;
        }
        $info_path = $dir_name . "/" . self::ARCHIVE_INFO_FILE;
        if (!file_exists($info_path)) {
            $info = [];
            $info['DESCRIPTION'] =
                "Archive does not exist OR Archive description file not found";
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = -1;
            $info["VERSION"] = self::DEFAULT_VERSION;
            return $info;
        }
        $info = unserialize(file_get_contents($info_path)) ?? [];
        $table_info = PartitionDocumentBundle::getParameterInfo($dir_name . "/".
            self::DOCUMENTS_FOLDER);
        return array_merge($info, $table_info);
    }
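    /*
     * Sketch of reading bundle metadata without instantiating the class
     * (the directory path is an assumption):
     *
     *     $info = IndexDocumentBundle::getArchiveInfo(C\CRAWL_DIR .
     *         "/cache/IndexDataExample");
     *     // $info merges archive_info.txt fields (DESCRIPTION, VERSION)
     *     // with PartitionDocumentBundle counts such as COUNT
     */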
    /**
     * Sets the archive info struct for the web archive bundle associated with
     * this bundle. This struct has fields like: DESCRIPTION
     * (serialized store of global parameters of the crawl like seed sites,
     * timestamp, etc).
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    public static function setArchiveInfo($dir_name, $info)
    {
        $archive_info_path = $dir_name. "/" . self::ARCHIVE_INFO_FILE;
        if (empty($info["VERSION"])) {
            $info["VERSION"] = self::DEFAULT_VERSION;
        }
        file_put_contents($archive_info_path, serialize($info));
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     * @return mixed either the modification time if the file exists or false
     */
    public static function getParamModifiedTime($dir_name)
    {
        $doc_param_path = $dir_name . "/" . self::DOCUMENTS_FOLDER . "/" .
            PartitionDocumentBundle::PARAMETERS_FILE;
        if (file_exists($doc_param_path)) {
            clearstatcache();
            return filemtime($doc_param_path);
        }
        return false;
    }
}