Last commit for src/executables/DictionaryUpdater.php: afd6930f42e31d81a53d42061b5fd758f56c62de

First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures

Chris Pollett [2024-01-15 02:Jan:th]
First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures
The folder structure of IndexDocumentBundles has also been modified and now supports an overflow
folder (which could be on a different hard drive). ArcTool has been updated to support migration
to the new indexes.
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\executables;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\media_jobs as MJ;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\IndexDocumentBundle;

/*
 * This program is only meant to be started from the command line. Refuse
 * to do anything when the IS_OWN_WEB_SERVER config constant is defined or
 * when PHP was not invoked through its command line interface.
 */
if (defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER") ||
    php_sapi_name() != 'cli') {
    echo "BAD REQUEST";
    exit();
}
/** For the crawlHash, crawlHashWord functions */
require_once __DIR__ . "/../library/Utility.php";

/*
 * Reading in a whole shard might take a fair bit of memory, so use the
 * memory limit configured for ArcTool
 */
ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT);
/*
 * Multi-byte string and multi-byte regex handling both use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
 * Command-line program used to add dictionary information to a document
 * bundle. Its run() method opens the bundle found at a given path, works
 * out which partition should be processed next, and asks the bundle to
 * update its dictionary.
 *
 * @author Chris Pollett
 */
class DictionaryUpdater implements CrawlConstants
{
    /**
     * The main code for the dictionary updater. Opens the bundle at
     * $bundle_path, sets its next_partition_to_add from the bundle's
     * NEXT_PARTITION_FILE (creating that file with the value 0 if it does
     * not exist yet), and then calls the bundle's updateDictionary method.
     * Partitions are groups of documents that have been downloaded, but
     * whose words have not necessarily been added to the dictionary for the
     * bundle.
     *
     * If $_SERVER["LOG_TO_FILES"] has not been set by the caller, logging
     * to files is switched on using a process name built from $channel.
     *
     * The process exits, after logging a message, if the kind of bundle
     * cannot be determined, if no class for that kind of bundle can be
     * loaded, or if the bundle object cannot perform updateDictionary.
     *
     * @param int $channel the channel the crawl is running on. Used in
     *  naming log files
     * @param string $bundle_path the path to the IndexDocumentBundle or
     *  FeedDocumentBundle we are adding dictionary info for
     */
    public static function run($channel, $bundle_path)
    {
        if (!isset($_SERVER["LOG_TO_FILES"])) {
            $_SERVER["LOG_TO_FILES"] = true;
            $process_name = $channel . "-DictionaryUpdater";
            L\crawlLog("\n\nInitialize logger..", $process_name, true);
        }
        $archive_type = self::getArchiveKind($bundle_path);
        if (!$archive_type) {
            L\crawlLog("Unknown Archive Type Exiting...");
            exit();
        }
        $archive_name = C\NS_LIB . $archive_type;
        /* getArchiveKind can return an arbitrary arc_type string read from
           an arc_description.ini file, so make sure it names a loadable
           class rather than letting new fail with a fatal error
         */
        if (!class_exists($archive_name)) {
            L\crawlLog("Archive class $archive_name could not be loaded. " .
                "Exiting...");
            exit(1);
        }
        if ($archive_type === "FeedDocumentBundle") {
            $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
            $db = new $dbms_manager();
            $db->connect();
            $index_archive = new $archive_name($bundle_path, $db,
                false, null, C\NUM_DOCS_PER_PARTITION);
            /* a feed bundle is also given the list of media sources,
               each with its auxiliary info parsed
             */
            $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss'
                 OR TYPE='html' OR TYPE='json' OR TYPE='regex')";
            $result = $db->execute($sql);
            $feeds = [];
            while ($feed = $db->fetchArray($result)) {
                MJ\FeedsUpdateJob::parseFeedAuxInfo($feed);
                $feeds[] = $feed;
            }
            $index_archive->feeds = $feeds;
        } else {
            $index_archive = new $archive_name($bundle_path,
                false, "", C\NUM_DOCS_PER_PARTITION);
        }
        /* Bail out before writing anything into the bundle folder if this
           kind of bundle has no way of handling an updateDictionary call
         */
        if (!is_callable([$index_archive, "updateDictionary"])) {
            L\crawlLog("$archive_type does not support dictionary updates." .
                " Exiting...");
            exit(1);
        }
        $next_partition_path = $bundle_path . "/".
            IndexDocumentBundle::NEXT_PARTITION_FILE;
        if (file_exists($next_partition_path)) {
            $index_archive->next_partition_to_add = intval(
                file_get_contents($next_partition_path));
        } else {
            $index_archive->next_partition_to_add = 0;
            file_put_contents($next_partition_path,
                $index_archive->next_partition_to_add);
        }
        //note the false parameter means will only update for one partition
        $index_archive->updateDictionary($next_partition_path, false);
    }
    /**
     * Given a folder name, determines the kind of bundle (if any) it holds.
     * It does this based on which marker files and folders exist in the
     * folder. The checks are made in the following order and the first one
     * that succeeds determines the result:
     *  - a bundle0 sub-folder: DoubleIndexBundleOld if
     *    bundle0/summaries/description.txt exists, otherwise
     *    DoubleIndexBundle
     *  - description.txt: WebArchiveBundle
     *  - filter_a.ftr: FeedArchiveBundle if summaries/description.txt
     *    exists, otherwise FeedDocumentBundle
     *  - summaries/description.txt: IndexArchiveBundle
     *  - archive_info.txt: IndexDocumentBundle
     *  - arc_description.ini (the case of a non-yioop archive): the
     *    arc_type value stored in that file
     *
     * @param string $archive_path the path to archive folder
     * @return string|bool the archive bundle type as described above, or
     *     false if none of the checks succeed or if arc_description.ini
     *     has no arc_type entry
     */
    public static function getArchiveKind($archive_path)
    {
        if (file_exists("$archive_path/bundle0")) {
            if (file_exists("$archive_path/bundle0/summaries/description.txt")){
                return "DoubleIndexBundleOld";
            } else {
                return "DoubleIndexBundle";
            }
        }
        if (file_exists("$archive_path/description.txt")) {
            return "WebArchiveBundle";
        }
        if (file_exists("$archive_path/filter_a.ftr")) {
            if (file_exists("$archive_path/summaries/description.txt")) {
                return "FeedArchiveBundle";
            } else {
                return "FeedDocumentBundle";
            }
        }
        if (file_exists("$archive_path/summaries/description.txt")) {
            return "IndexArchiveBundle";
        } else if (file_exists("$archive_path/archive_info.txt")) {
            return "IndexDocumentBundle";
        }
        $desc_path = "$archive_path/arc_description.ini";
        if (file_exists($desc_path)) {
            $desc = L\parse_ini_with_fallback($desc_path);
            if (!isset($desc['arc_type'])) {
                return false;
            }
            return $desc['arc_type'];
        }
        return false;
    }
}

/*
 * Command-line entry point. The expected arguments are a command (either
 * "run" or "terminal"), a channel, and the path of the bundle to update.
 * With any other arguments nothing is done.
 */
if (!empty($argv[3]) &&
    ($argv[1] == "run" || $argv[1] == "terminal")) {
    if ($argv[1] == "terminal") {
        /* DictionaryUpdater::run() only sets up its own logging when
           LOG_TO_FILES has not been set, so this value is left in place
         */
        $_SERVER["LOG_TO_FILES"] = "terminal";
    }
    DictionaryUpdater::run($argv[2], $argv[3]);
}
ViewGit