Last commit for src/executables/Mirror.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\executables;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\CrawlDaemon;
use seekquarry\yioop\library\FetchUrl;

/*
 * This script may only be run from the command line, and not from within
 * a process in which the IS_OWN_WEB_SERVER constant has been defined.
 */
if (php_sapi_name() != 'cli' ||
    defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
    echo "BAD REQUEST"; exit();
}
/** CRAWLING means don't try to use cache
 * @ignore
 */
$_SERVER["USE_CACHE"] = false;
/** for crawlHash and crawlLog and Yioop constants */
require_once __DIR__."/../library/Utility.php";
/*
 * Without a configured profile there is nothing to mirror, so tell the
 * user how to set one up and quit.
 */
if (!C\PROFILE) {
    // the trailing space keeps "visiting" and "its" from running together
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/*
 * Memory available to the mirror process comes from the Yioop configuration
 */
ini_set("memory_limit", C\MIRROR_MEMORY_LIMIT);
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
 * This class is responsible for syncing crawl archives between machines using
 * the SeekQuarry/Yioop search engine
 *
 * Mirror periodically queries its parent Yioop instance (by default the name
 * server) asking for a list of files that have changed since the last sync
 * time. It then proceeds to download them, one file per pass of its main
 * loop, into the local cache directory.
 *
 * @author Chris Pollett
 */
class Mirror implements CrawlConstants
{
    /**
     * Reference to a database object. Used since has directory manipulation
     * functions
     * @var object
     */
    public $db;
    /**
     * Url or IP address of the name_server. Used as the machine to mirror
     * when no mirror_parent.txt file is present
     * @var string
     */
    public $name_server;

    /**
     * Timestamp of the last completed sync with the parent machine
     * (0 if no sync has been recorded yet)
     * @var int
     */
    public $last_sync;
    /**
     * Timestamp of the last time the machine being mirrored was contacted
     * to let it know Mirror.php is still running
     * @var int
     */
    public $last_notify;
    /**
     * File name where last sync time is written
     * @var string
     */
    public $last_sync_file;
    /**
     * Timestamp of the start of the current sync. While this is greater
     * than $last_sync a sync is considered to be in progress
     * @var int
     */
    public $start_sync;
    /**
     * Files still to download for the current sync. Each entry is an array
     * from which the 'name', 'is_dir' and 'size' fields are read
     * @var array
     */
    public $sync_schedule;
    /**
     * Directory to sync
     * @var string
     */
    public $sync_dir;
    /**
     * Url of the Yioop instance we are mirroring
     * @var string
     */
    public $parent_url;
    /**
     * Maximum number of bytes from a file to download in one go. Files at
     * least this large are fetched in segments of this many bytes
     */
    const DOWNLOAD_RANGE = 50000000;
    /**
     * Number of consecutive failed attempts at downloading one segment of a
     * large file that are tolerated before the whole file is put back on
     * the schedule and control returns to the main loop
     */
    const MAX_SEGMENT_RETRIES = 5;
    /**
     * Sets up the field variables so that syncing can begin
     *
     * @param string $name_server URL or IP address of the name server
     */
    public function __construct($name_server)
    {
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $this->db = new $db_class();
        $this->name_server = $name_server;
        $this->last_sync_file = C\CRAWL_DIR . "/schedules/last_sync.txt";
        $this->last_sync = 0;
        if (file_exists($this->last_sync_file)) {
            $saved_sync = unserialize(
                file_get_contents($this->last_sync_file),
                ['allowed_classes' => false]);
            /* an unreadable or corrupt file yields a non-number; in that
               case act as if no sync has ever happened
             */
            if (is_numeric($saved_sync)) {
                $this->last_sync = $saved_sync;
            }
        }
        $this->start_sync = $this->last_sync;
        $this->last_notify = $this->last_sync;
        $this->sync_schedule = [];
        $this->sync_dir = C\CRAWL_DIR . "/cache";
        $this->parent_url = $name_server;
    }
    /**
     * This is the function that should be called to get the mirror to start
     * syncing. Calls init to handle the command line arguments then enters
     * the syncer's main loop
     */
    public function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "Mirror");
        L\crawlLog("\n\nInitialize logger..", "mirror", true);
        $this->loop();
    }
    /**
     * Main loop for the mirror script.
     *
     * Each pass: consumes a pending message file if there is one, works
     * out which machine is the parent, asks checkScheduler() for the
     * current status and, if there is data to sync, copies the next
     * scheduled file. Runs for as long as CrawlDaemon::processHandler()
     * returns a true value.
     */
    public function loop()
    {
        L\crawlLog("In Sync Loop");
        L\crawlLog("PHP Version in use: " . phpversion());
        $info = [self::STATUS => self::CONTINUE_STATE];
        $syncer_message_file = C\CRAWL_DIR . "/schedules/MirrorMessages.txt";
        $parent_file = C\CRAWL_DIR . "/schedules/mirror_parent.txt";
        while (CrawlDaemon::processHandler()) {
            if (file_exists($syncer_message_file)) {
                $info = unserialize(file_get_contents($syncer_message_file));
                unlink($syncer_message_file);
                /* NOTE(review): on a stop message we only skip this pass;
                   presumably processHandler() is what ends the loop --
                   confirm against CrawlDaemon
                 */
                if (isset($info[self::STATUS]) &&
                    $info[self::STATUS] == self::STOP_STATE) {
                    continue;
                }
            }
            if (file_exists($parent_file)) {
                // stray whitespace/newlines would corrupt the request url
                $this->parent_url = trim(
                    (string)file_get_contents($parent_file));
                L\crawlLog("Read File: " . $parent_file . ".");
                L\crawlLog("Set parent server to: " . $this->parent_url);
            } else {
                L\crawlLog("File: " . $parent_file . " does not exist. ");
                L\crawlLog("Assuming parent is name server: ".
                    $this->name_server);
                $this->parent_url = $this->name_server;
            }
            $info = $this->checkScheduler();
            if (!is_array($info)) {
                L\crawlLog("Cannot connect to parent server...".
                    " will try again in ".
                    C\MIRROR_NOTIFY_FREQUENCY." seconds.");
                sleep(C\MIRROR_NOTIFY_FREQUENCY);
                continue;
            }
            // a reply without a status is compared as null, without warning
            if (($info[self::STATUS] ?? null) == self::NO_DATA_STATE) {
                L\crawlLog("No data from parent server. Sleeping...");
                sleep(C\MIRROR_NOTIFY_FREQUENCY);
                continue;
            }
            $this->copyNextSyncFile();
        } //end while
        L\crawlLog("Mirror shutting down!!");
    }
    /**
     * Gets status and, if done processing all other mirroring activities,
     * gets a new list of files that have changed since the last synchronization
     * from the web app of the machine we are mirroring with.
     *
     * When no sync is in progress and MIRROR_SYNC_FREQUENCY seconds have
     * passed since the last one, a syncList request is made. Otherwise, the
     * status is computed locally and, at most once every
     * MIRROR_NOTIFY_FREQUENCY seconds, a syncNotify request is sent to the
     * parent. Whenever the schedule is empty (and the parent did not just
     * report no data) the last sync time is written to $last_sync_file.
     *
     * @return mixed array or bool. Returns false if weren't successful in
     *     contacting web app or its reply could not be decoded, otherwise,
     *     returns an array with a status (the list of files to sync, if
     *     any, is moved into $sync_schedule)
     */
    public function checkScheduler()
    {
        $info = [];
        $start_time = microtime(true);
        $time = time();
        $session = md5($time . C\AUTH_KEY);
        $write_sync_time = true;
        $request =
            $this->parent_url .
            "?c=resource&time=$time&session=$session" .
            "&robot_instance=" . C\ROBOT_INSTANCE . "&machine_uri=" .
            C\WEB_URI . "&last_sync=" . $this->last_sync;
        if ($this->start_sync <= $this->last_sync &&
            $this->last_sync + C\MIRROR_SYNC_FREQUENCY < $time) {
            $request .= "&a=syncList";
            L\crawlLog("Getting Sync List...");
            $info_string = FetchUrl::getPage($request, null, true);
            if ($info_string === false) {
                return false;
            }
            $this->last_notify = $time;
            $info = $this->decodeSyncResponse($info_string);
            $status = is_array($info) ? ($info[self::STATUS] ?? null) : null;
            if ($status !== null && $status == self::CONTINUE_STATE) {
                $this->start_sync = $time;
                /* a reply lacking a usable file list is treated as an empty
                   schedule, so count() below never sees a non-array
                 */
                $this->sync_schedule = (isset($info[self::DATA]) &&
                    is_array($info[self::DATA])) ? $info[self::DATA] : [];
                unset($info[self::DATA]);
            } else if ($status !== null && $status == self::NO_DATA_STATE) {
                $this->last_sync = $time;
                $this->start_sync = $time;
                $write_sync_time = false;
            }
        } else {
            $info[self::STATUS] = ($this->last_sync == $this->start_sync) ?
                self::NO_DATA_STATE : self::CONTINUE_STATE;
            L\crawlLog("Current time $time, last notify time ".
                $this->last_notify."...");
            if ($time - $this->last_notify > C\MIRROR_NOTIFY_FREQUENCY) {
                $request .= "&a=syncNotify";
                FetchUrl::getPage($request, null, true);
                $this->last_notify = $time;
                L\crawlLog("Notifying master that mirror is alive..");
            } else {
                L\crawlLog("So not notifying scheduler..");
            }
        }
        if (count($this->sync_schedule) == 0 && $write_sync_time) {
            // nothing left to copy, so the sync begun at start_sync is done
            $this->last_sync = $this->start_sync;
            $this->db->setWorldPermissionsRecursive($this->sync_dir, true);
            file_put_contents($this->last_sync_file,
                serialize($this->last_sync));
        }
        L\crawlLog("  Time to check Scheduler ".
            L\changeInMicrotime($start_time));
        return $info;
    }
    /**
     * Decodes the body of a syncList reply, which is expected to be a
     * base64 encoded, gz-compressed, serialized PHP array.
     *
     * The payload arrives over the network, so objects are not allowed to
     * be instantiated while unserializing it.
     * NOTE(review): assumes the parent only ever sends nested arrays and
     * scalars -- confirm against the resource controller.
     *
     * @param string $info_string raw body returned by the parent
     * @return mixed the decoded array, or false if the body could not be
     *     decoded into an array
     */
    private function decodeSyncResponse($info_string)
    {
        $compressed = base64_decode(trim((string)$info_string));
        if ($compressed === false || $compressed === "") {
            return false;
        }
        $serialized = gzuncompress($compressed);
        if ($serialized === false) {
            return false;
        }
        $info = unserialize($serialized, ['allowed_classes' => false]);
        return is_array($info) ? $info : false;
    }
    /**
     * Downloads the next file from the schedule of files to download received
     * from the web app.
     *
     * Directories are created locally. Files smaller than DOWNLOAD_RANGE
     * are fetched with a single request; larger ones are fetched in
     * DOWNLOAD_RANGE sized segments. A download whose length does not match
     * the size in the schedule puts the file back on the schedule so it is
     * tried again on a later call.
     */
    public function copyNextSyncFile()
    {
        if (count($this->sync_schedule) <= 0) {
            return;
        }
        $time = time();
        $session = md5($time . C\AUTH_KEY);
        $file = array_pop($this->sync_schedule);
        /* NOTE(review): the name comes from the parent and is used
           unchecked in a local path; this trusts the parent machine
         */
        $path = $this->sync_dir . "/" . $file['name'];
        L\crawlLog("Start syncing {$file['name']}..");
        if ($file['is_dir']) {
            if (file_exists($path)) {
                L\crawlLog(".. {$file['name']} directory exists.");
            } else if (mkdir($path)) {
                L\crawlLog(".. {$file['name']} directory created.");
            } else {
                L\crawlLog(".. {$file['name']} directory could not be ".
                    "created.");
            }
            return;
        }
        $request = $this->parent_url .
            "?c=resource&a=get&time=$time&session=$session".
            "&f=cache&n=" . urlencode($file["name"]);
        if ($file["size"] < self::DOWNLOAD_RANGE) {
            $data = FetchUrl::getPage($request, null, true);
            // a failed fetch counts as having received zero bytes
            $data = is_string($data) ? $data : "";
            if ($file["size"] != strlen($data)) {
                array_push($this->sync_schedule, $file);
                L\crawlLog(".. {$file['name']} error ".
                    "downloading, retrying.");
                return;
            }
            if (file_put_contents($path, $data) === false) {
                L\crawlLog(".. {$file['name']} could not be written.");
                return;
            }
            L\crawlLog(".. {$file['name']} file copied.");
            return;
        }
        $fh = fopen($path, "wb");
        if ($fh === false) {
            /* not re-queued: a path that cannot be opened would otherwise
               be retried forever
             */
            L\crawlLog(".. {$file['name']} could not be opened for ".
                "writing, skipping.");
            return;
        }
        $request .= "&l=" . self::DOWNLOAD_RANGE;
        $offset = 0;
        $failures = 0;
        while ($offset < $file['size']) {
            $data = FetchUrl::getPage($request . "&o=$offset", null, true);
            $data = is_string($data) ? $data : "";
            $end_point = min($offset + self::DOWNLOAD_RANGE, $file["size"]);
            //crude check if we need to redownload segment
            if (strlen($data) != ($end_point - $offset)) {
                $failures++;
                if ($failures > self::MAX_SEGMENT_RETRIES) {
                    /* give up for now so the main loop regains control;
                       the file is downloaded from scratch next time
                     */
                    fclose($fh);
                    array_push($this->sync_schedule, $file);
                    L\crawlLog(".. {$file['name']} error ".
                        "downloading, retrying.");
                    return;
                }
                L\crawlLog(".. Download error re-requesting segment");
                continue;
            }
            $failures = 0;
            fwrite($fh, $data);
            L\crawlLog(".. {$file['name']} downloaded bytes ".
                "$offset to $end_point..");
            $offset = $end_point;
        }
        fclose($fh);
        L\crawlLog(".. {$file['name']} file copied.");
    }
}
/*
 * Instantiates a Mirror for the configured name server and starts it
 * running. start() only returns once the mirror's main loop has ended.
 */
$syncer =  new Mirror(C\NAME_SERVER);
$syncer->start();
ViewGit