<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\controllers;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\CrawlDaemon;
use seekquarry\yioop\library\MessagesBundle;
use seekquarry\yioop\library\classifiers\Classifier;

/**
 * This class handles data coming to a queue_server from a fetcher.
 * Basically, it receives the data from the fetcher and saves it into
 * various files for later processing by the queue server.
 * This class can also be used by a fetcher to get status information.
 *
 * @author Chris Pollett
 */
class FetchController extends Controller implements CrawlConstants
{
    /**
     * These are the activities supported by this controller
     * @var array
     */
    public $activities = ["schedule", "archiveSchedule", "update", "crawlTime"];
    /**
     * File used to store info about the status of a queue server's
     * active crawl. Defaults to channel 0, but this may change in
     * @see processRequest
     * @var string
     */
    public $crawl_status_file_name =
        C\SCHEDULES_DIR . "/0-" . self::crawl_status_file;
    /**
     * Number of seconds that must elapse after the last call before doing
     * cron activities (mainly, checking the liveness of fetchers that
     * should be alive)
     */
    const CRON_INTERVAL = 300;
    /**
     * Checks that the request seems to be coming from a legitimate fetcher,
     * determines which activity the fetcher is requesting and calls that
     * activity for processing.
     */
    public function processRequest()
    {
        // to allow the calculation of longer archive schedules
        if (!$this->web_site->isCli()) {
            ini_set('max_execution_time', 60);
        }
        $data = [];
        /* do a quick test to see if this request seems to be
           from a legitimate machine
         */
        if (!$this->checkRequest()) {
            $this->web_site->header("HTTP/1.0 400 Bad Request");
            $_REQUEST['p'] = "400";
            $_REQUEST['c'] = "static";
            \seekquarry\yioop\bootstrap($this->web_site, false);
            return;
        }
        $activity = $_REQUEST['a'];
        $channel = $this->getChannel();
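        /* fetcher requests are partitioned by crawl channel, so the crawl
           status file and the robot table below are kept in per-channel
           files
         */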
        $this->crawl_status_file_name =
            C\SCHEDULES_DIR . "/{$channel}-" . self::crawl_status_file;
        $robot_table_name = C\WORK_DIRECTORY . "/{$channel}-" .
            self::robot_table_name;
        $robot_table = [];
        if (file_exists($robot_table_name)) {
            $robot_table = unserialize(file_get_contents($robot_table_name));
        }
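        /* each robot table entry appears to map a fetcher instance name to
           a triple [ip address, machine uri, last contact time]; the last
           contact time lets cron activities check fetcher liveness
         */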
        if (isset($_REQUEST['robot_instance']) &&
            (isset($_REQUEST['machine_uri'])) && is_array($robot_table)) {
            $robot_instance =
                $this->clean($_REQUEST['robot_instance'], "string") ?? 0;
            $robot_table[$robot_instance] = [L\remoteAddress(),
                $this->clean($_REQUEST['machine_uri'], "string"),
                time()];
            file_put_contents($robot_table_name, serialize($robot_table),
                LOCK_EX);
        }
        if (in_array($activity, $this->activities)) {
            $this->call($activity);
        }
    }
    /**
     * Returns the channel used by the fetcher making the current request
     *
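     * The channel is the part of the robot_instance name after the first
     * "-"; for example, a hypothetical instance name "fetcher1-2" would
     * yield channel 2, while names without such a suffix give channel 0.
     *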
     * @return int channel used
     */
    public function getChannel()
    {
        $channel = 0;
        if (!empty($_REQUEST['robot_instance'])) {
            $instance_parts = explode("-", $_REQUEST['robot_instance']);
            $channel = (empty($instance_parts[1])) ? 0 : $instance_parts[1];
        }
        return $channel;
    }
    /**
     * Checks if there is a schedule of sites to crawl available and
     * if so, presents it to the requesting fetcher, and then deletes it.
     */
    public function schedule()
    {
        $view = "fetch";
        // set up query
        $data = [];
        if (isset($_REQUEST['crawl_time'])) {
            $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
                0, C\TIMESTAMP_LEN);
        } else {
            $crawl_time = 0;
        }
        $info = [];
        $info[self::STATUS] = self::NO_DATA_STATE;
        $data['MESSAGE'] = base64_encode(serialize($info)) . "\n";
        $message = "";
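        /* A MessagesBundle appears to maintain sliding window style send
           and receive queues on disk: extractSendWindow() yields the next
           schedule to hand to a fetcher, while extractReceiveWindow()
           yields uploaded fetcher data that is ready for further processing
         */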
        if ($crawl_time > 0) {
            $messages_folder = C\SCHEDULES_DIR . "/" .
                self::messages_data_base_name .  $crawl_time;
            $messages_bundle = new MessagesBundle($messages_folder);
            $messages_bundle->initWindows();
            // get a schedule if one exists, else false
            $message = $messages_bundle->extractSendWindow();
            if (!$message) {
                $message = $data['MESSAGE'];
            }
            /* this extracts a file ready for further processing from the
               receive window, if one exists, and moves it to another
               folder for that processing
             */
            $packed_data = $messages_bundle->extractReceiveWindow();
            if (!empty($packed_data)) {
                list($byte_counts, $upload_data) = unserialize(
                    $packed_data);
                $this->handleUploadedData($byte_counts,
                    $upload_data, $messages_bundle);
            }
        }
        $data['MESSAGE'] = $message;
        $this->displayView($view, $data);
    }
    /**
     * Checks to see whether there are more pages to extract from the current
     * archive, and if so returns the next batch to the requesting fetcher. The
     * iteration progress is automatically saved on each call to nextPages, so
     * that the next fetcher will get the next batch of pages. If there is no
     * current archive to iterate over, or the iterator has reached the end of
     * the archive, then it indicates that there is no more data by setting
     * the status to NO_DATA_STATE.
     */
    public function archiveSchedule()
    {
        $view = "fetch";
        $request_start = time();
        if (isset($_REQUEST['crawl_time'])) {
            $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
                0, C\TIMESTAMP_LEN);
        } else {
            $crawl_time = 0;
        }
        $channel = $this->getChannel();
        $messages_filename = CrawlDaemon::getMessageFileName("NameServer",
            $channel);
        $lock_filename = CrawlDaemon::getLockFileName("NameServer",
            $channel);
        if ($crawl_time > 0 && file_exists($messages_filename)) {
            $fetch_pages = true;
            $info = unserialize(file_get_contents($messages_filename));
            if ($info[self::STATUS] == 'STOP_CRAWL') {
                /* The stop crawl message gets created by the admin_controller
                   when the "stop crawl" button is pressed.*/
                if (file_exists($messages_filename)) {
                    unlink($messages_filename);
                }
                if (file_exists($lock_filename)) {
                    unlink($lock_filename);
                }
                $fetch_pages = false;
                $info = [];
            }
        } else {
            $fetch_pages = false;
            $info = [];
        }
        $pages = [];
        $got_lock = true;
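        /* the lock file holds the start time of the request that created
           it; a lock younger than max_execution_time presumably means
           another fetcher's archive request is still being processed, so
           no pages are extracted on this request
         */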
        if (file_exists($lock_filename)) {
            $lock_time = unserialize(file_get_contents($lock_filename));
            if ($request_start - $lock_time < ini_get('max_execution_time')){
                $got_lock = false;
            }
        }
        $chunk = false;
        $archive_iterator = null;
        if ($fetch_pages && $got_lock) {
            file_put_contents($lock_filename, serialize($request_start));
            if ($info[self::ARC_DIR] == "MIX" ||
                    file_exists($info[self::ARC_DIR])) {
                $iterate_timestamp = $info[self::CRAWL_INDEX];
                $result_timestamp = $crawl_time;
                $result_dir = C\SCHEDULES_DIR .
                    "/" . self::name_archive_iterator . $crawl_time;
                $arctype = $info[self::ARC_TYPE];
                $iterator_name = C\NS_ARCHIVE . $arctype . "Iterator";
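                /* the iterator class name is derived from the archive
                   type; e.g., an ARC_TYPE of "TextArchiveBundle" gives
                   C\NS_ARCHIVE . "TextArchiveBundleIterator"
                 */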
                try {
                    if ($info[self::ARC_DIR] == "MIX") {
                        //recrawl of crawl mix case
                        $archive_iterator = new $iterator_name(
                            $iterate_timestamp, $result_timestamp);
                    } else {
                        //any other archive crawl except web archive recrawls
                        $archive_iterator = new $iterator_name(
                            $iterate_timestamp, $info[self::ARC_DIR],
                            $result_timestamp, $result_dir);
                    }
                } catch (\Exception $e) {
                    $info['ARCHIVE_BUNDLE_ERROR'] =
                        "Invalid bundle iterator: '{$iterator_name}' \n".
                        $e->getMessage();
                }
            }
            $pages = false;
            if ($archive_iterator && !$archive_iterator->end_of_iterator) {
                if (L\generalIsA($archive_iterator,
                    C\NS_ARCHIVE . "TextArchiveBundleIterator")) {
                    $pages = $archive_iterator->nextChunk();
                    $chunk = true;
                } else {
                    $pages = $archive_iterator->nextPages(
                        C\ARCHIVE_BATCH_SIZE);
                }
            }
            if (file_exists($lock_filename)) {
                unlink($lock_filename);
            }
        }
        if (!empty($archive_iterator) && $archive_iterator->end_of_iterator) {
            $info[self::END_ITERATOR] = true;
        }
        if ((!empty($chunk) && !isset($pages)) || (!empty($pages))) {
            $pages_string = L\webencode(gzcompress(serialize($pages)));
        } else {
            $info[self::STATUS] = self::NO_DATA_STATE;
            $info[self::POST_MAX_SIZE] = L\metricToInt(
                ini_get("post_max_size"));
            $pages = [];
            $pages_string = L\webencode(gzcompress(serialize($pages)));
        }
        $info[self::DATA] = $pages_string;
        $info_string = serialize($info);
        $data['MESSAGE'] = $info_string;
        $this->displayView($view, $data);
    }
    /**
     * Processes Robot, To Crawl, and Index data sent from a fetcher.
     * Acknowledges to the fetcher whether this data was received okay.
     */
    public function update()
    {
        $view = "fetch";
        $info_flag = false;
        $logging = "";
        $necessary_fields = ['byte_counts', 'current_part', 'hash_data',
            'hash_part', 'num_parts', 'part', 'sequence_number', 'diff_origin'];
        $part_flag = true;
        $missing = "";
        $channel = $this->getChannel();
        foreach ($necessary_fields as $field) {
            if (!isset($_REQUEST[$field])) {
                $part_flag = false;
                $missing = $field;
            }
        }
        $crawl_type = $this->clean($_REQUEST['crawl_type'] ?? "", 'string');
        $arc_type = $this->clean($_REQUEST['arc_type'] ?? "", 'string');
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'] ?? "0",'int'),
                0, C\TIMESTAMP_LEN);
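        /* a fetcher splits a large upload into num_parts pieces; each
           piece is checked against hash_part and appended to a temp file
           named by hash_data. Only when the last piece has arrived
           ($upload == true below) is the reassembled data processed
         */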
        if ($part_flag &&
            L\crawlHash($_REQUEST['part']) == $_REQUEST['hash_part']) {
            $upload = false;
            if (!file_exists(C\TEMP_DIR)) {
                mkdir(C\TEMP_DIR);
                L\setWorldPermissionsRecursive(C\TEMP_DIR . "/");
            }
            $filename = C\TEMP_DIR . "/" . $_REQUEST['hash_data'];
            file_put_contents($filename, $_REQUEST['part'], FILE_APPEND);
            L\setWorldPermissions($filename);
            if (intval($_REQUEST['num_parts']) > 1) {
                $info_flag = true;
                if ($_REQUEST['num_parts'] == $_REQUEST['current_part']) {
                    $upload = true;
                }
            } else if (intval($_REQUEST['num_parts']) == 1) {
                $info_flag = true;
                $upload = true;
            }
            if ($upload) {
                $messages_folder = C\SCHEDULES_DIR . "/" .
                    self::messages_data_base_name .  $crawl_time;
                $messages_bundle = new MessagesBundle($messages_folder);
                $messages_bundle->initWindows();
                $diff_origin = filter_var($_REQUEST['diff_origin'],
                    FILTER_VALIDATE_BOOLEAN);
                $sequence_number = intval($_REQUEST['sequence_number']);
                $byte_counts = [];
                if (!empty($_REQUEST['byte_counts'])) {
                    $byte_counts = unserialize(
                        L\webdecode($_REQUEST['byte_counts']));
                }
                $schedule_time = $byte_counts["SCHEDULE_TIME"] ?? false;
                $upload_old = false;
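                /* for repeating (DoubleIndexArchive) crawls: if the index
                   was swapped after this data was scheduled, the data
                   belongs to the previous index, so it is discarded below
                 */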
                if (file_exists($this->crawl_status_file_name)) {
                    $crawl_status = unserialize(file_get_contents(
                        $this->crawl_status_file_name));
                    if (!empty($crawl_status['REPEAT_TIME']) &&
                        ($schedule_time &&
                        $crawl_status['REPEAT_TIME'] > $schedule_time)) {
                        $upload_old = true;
                    }
                }
                if ($upload_old) {
                    $logging = "DoubleIndexArchive has switched index!\n" .
                        "Discarding old index upload data!";
                    unlink($filename);
                } else {
                    $logging = "... Data received successfully\n";
                    $upload_data = file_get_contents($filename);
                    unlink($filename);
                    if (!$diff_origin && ($crawl_type != self::ARCHIVE_CRAWL ||
                        $arc_type == "WebArchiveBundle" || $arc_type == "")) {
                        $messages_bundle->updateReceiveWindow($sequence_number,
                            serialize([$byte_counts, $upload_data]));
                        $packed_data = $messages_bundle->extractReceiveWindow();
                        if (!empty($packed_data)) {
                            list($byte_counts, $upload_data) = unserialize(
                                $packed_data);
                        } else {
                            $upload_data = "";
                            $byte_counts = [];
                        }
                    }
                    $this->handleUploadedData($byte_counts,
                        $upload_data, $messages_bundle);
                }
            } else {
                $logging = "...".(
                    $_REQUEST['current_part']/$_REQUEST['num_parts']).
                    " of data uploaded.";
            }
        }
        $info = [];
        $info['REPEAT_TIME'] = $crawl_status['REPEAT_TIME'] ?? 0;
        $info['SLEEP_START'] = $crawl_status['SLEEP_START'] ?? "00:00";
        $info['SLEEP_DURATION'] = $crawl_status['SLEEP_DURATION'] ?? 0;
        if ($logging != "") {
            $info[self::LOGGING] = $logging;
        }
        if ($info_flag == true) {
            $info[self::STATUS] = self::CONTINUE_STATE;
        } else {
            $info[self::STATUS] = self::REDO_STATE;
            if (!$part_flag) {
                $info[self::SUMMARY] = "Missing request field: $missing.";
            } else {
                $info[self::SUMMARY] = "Hash of uploaded data was:".
                    L\crawlHash($_REQUEST['part']).". Sent checksum was:".
                    $_REQUEST['hash_part'];
            }
        }
        $info[self::MEMORY_USAGE] = memory_get_peak_usage();
        $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
        if (file_exists($this->crawl_status_file_name)) {
            $change = false;
            $crawl_status = unserialize(file_get_contents(
                $this->crawl_status_file_name));
            //want to make sure $crawl_status has data if we will change it
            if (!empty($crawl_status['CRAWL_TIME'])) {
                if (isset($_REQUEST['fetcher_peak_memory'])) {
                    if (!isset($crawl_status['FETCHER_PEAK_MEMORY']) ||
                        $_REQUEST['fetcher_peak_memory'] >
                        $crawl_status['FETCHER_PEAK_MEMORY']
                    ) {
                        $crawl_status['FETCHER_PEAK_MEMORY'] =
                            $_REQUEST['fetcher_peak_memory'];
                        $change = true;
                    }
                }
                if (!isset($crawl_status['WEBAPP_PEAK_MEMORY']) ||
                    $info[self::MEMORY_USAGE] >
                    $crawl_status['WEBAPP_PEAK_MEMORY']) {
                    $crawl_status['WEBAPP_PEAK_MEMORY'] =
                        $info[self::MEMORY_USAGE];
                    $change = true;
                }
            }
            if (!isset($crawl_status['CRAWL_TIME'])) {
                $network_filename = C\SCHEDULES_DIR .
                    "/{$channel}-" . self::network_status_file;
                if (file_exists($network_filename)) {
                    $net_info = unserialize(file_get_contents(
                        $network_filename));
                    $info[self::CRAWL_TIME] = $net_info[self::CRAWL_TIME] ?? 0;
                    $info[self::SLEEP_START] = $net_info[self::SLEEP_START] ??
                        "00:00";
                    $info[self::SLEEP_DURATION] =
                        $net_info[self::SLEEP_DURATION] ?? "-1";
                } else {
                    $info[self::CRAWL_TIME] = 0;
                    $info[self::SLEEP_START] = "00:00";
                    $info[self::SLEEP_DURATION] = "-1";
                }
            } else {
                $info[self::CRAWL_TIME] = $crawl_status['CRAWL_TIME'];
                $info[self::SLEEP_START] = $crawl_status["SLEEP_START"] ??
                    "00:00";
                $info[self::SLEEP_DURATION] =
                    $crawl_status["SLEEP_DURATION"] ?? "-1";
            }
            if ($change == true) {
                file_put_contents($this->crawl_status_file_name,
                    serialize($crawl_status), LOCK_EX);
            }
        } else {
            $info[self::CRAWL_TIME] = 0;
        }
        $info[self::MEMORY_USAGE] = memory_get_peak_usage();
        $data = [];
        $data['MESSAGE'] = serialize($info);
        $this->displayView($view, $data);
    }
    /**
     * After robot, schedule, and index data have been uploaded and reassembled
     * as one big data string, this function splits that string into each of
     * these data types and then saves the result into the appropriate
     * sub-folder of the $messages_bundle. Any temporary files used during
     * uploading are then deleted.
     *
     * @param array $byte_counts lengths of the robot, cache page validation,
     *     and schedule portions of the uploaded string
     * @param string $uploaded reassembled upload data to be split apart
     * @param MessagesBundle $messages_bundle used to store the messages
     *     contained in $uploaded so that they can be later processed
     * @return string $logging diagnostic info to be sent to fetcher about
     *     what was done
     */
    public function handleUploadedData($byte_counts, &$uploaded,
        $messages_bundle)
    {
        $logging = "... Data upload complete";
        $address = strtr(L\remoteAddress(), ["." => "-", ":" => "_"]);
        $time = time();
        $day = floor($time/C\ONE_DAY);
        $robot_data = "";
        $cache_page_validation_data = "";
        $schedule_data = "";
        $index_data = "";
        if (isset($byte_counts["TOTAL"]) &&
            $byte_counts["TOTAL"] > 0) {
            $pos = 0;
            $robot_data = substr($uploaded, $pos, $byte_counts["ROBOT"]);
            $pos += $byte_counts["ROBOT"];
            $cache_page_validation_data = substr($uploaded, $pos,
                $byte_counts["CACHE_PAGE_VALIDATION"]);
            $pos += $byte_counts["CACHE_PAGE_VALIDATION"];
            $schedule_data =
                substr($uploaded, $pos, $byte_counts["SCHEDULE"]);
            $pos += $byte_counts["SCHEDULE"];
            $index_data =
                substr($uploaded, $pos);
        }
        $added_data = false;
        if (strlen($robot_data) > 0) {
            $messages_bundle->addMessages(MessagesBundle::ROBOTS_FOLDER,
                $robot_data);
            $added_data = true;
        }
        if (C\USE_ETAG_EXPIRES && strlen($cache_page_validation_data) > 0) {
            $messages_bundle->addMessages(MessagesBundle::ETAG_EXPIRES_FOLDER,
                $cache_page_validation_data);
            $added_data = true;
        }
        if (strlen($schedule_data) > 0) {
            $messages_bundle->addMessages(MessagesBundle::SCHEDULES_FOLDER,
                $schedule_data);
            $added_data = true;
        }
        if (strlen($index_data) > 0) {
            $messages_bundle->addMessages(MessagesBundle::INDEX_DATA_FOLDER,
                $index_data);
            $added_data = true;
        }
        if ($added_data) {
            /* The modification time of $this->crawl_status_file_name is
               used to determine if the crawl has stalled. We just uploaded
               data, so the crawl is not stalled.
             */
            touch($this->crawl_status_file_name, time());
        }
        return $logging;
    }
    /**
     * Checks for the crawl time according to either crawl_status.txt or
     * NetworkStatus.txt, and presents it to the requesting fetcher, along
     * with a list of available queue servers.
     */
    public function crawlTime()
    {
        $info = [];
        $info[self::STATUS] = self::CONTINUE_STATE;
        $view = "fetch";
        $cron_model = $this->model("cron");
        $channel = $this->getChannel();
        if (isset($_REQUEST['crawl_time'])) {
            $prev_crawl_time = substr(
                $this->clean($_REQUEST['crawl_time'], 'int'), 0,
                C\TIMESTAMP_LEN);
        } else {
            $prev_crawl_time = 0;
        }
        $cron_time = $cron_model->getCronTime("fetcher_restart");
        $delta = time() - $cron_time;
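        /* restart any crashed fetchers at most once every CRON_INTERVAL
           (300) seconds rather than on every fetcher request
         */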
        if ($delta > self::CRON_INTERVAL) {
            $cron_model->updateCronTime("fetcher_restart");
            $this->doFetcherCronTasks();
        } else if ($delta == 0) {
            $cron_model->updateCronTime("fetcher_restart");
        }
        $network_filename = C\SCHEDULES_DIR .
            "/{$channel}-" . self::network_status_file;
        if (file_exists($this->crawl_status_file_name)) {
            $crawl_status = unserialize(file_get_contents(
                $this->crawl_status_file_name));
            $crawl_time = (isset($crawl_status["CRAWL_TIME"])) ?
                $crawl_status["CRAWL_TIME"] : 0;
            $sleep_start = $crawl_status["SLEEP_START"] ?? "00:00";
            $sleep_duration = $crawl_status["SLEEP_DURATION"] ?? "-1";
        } else if (file_exists($network_filename)) {
            $net_status = unserialize(file_get_contents($network_filename));
            $crawl_time = $net_status[self::CRAWL_TIME] ?? 0;
            $sleep_start = $net_status[self::SLEEP_START] ?? "00:00";
            $sleep_duration = $net_status[self::SLEEP_DURATION] ?? "-1";
        } else {
            $crawl_time = 0;
            $sleep_start = "00:00";
            $sleep_duration = "-1";
        }
        $info[self::CRAWL_TIME] = $crawl_time;
        $info[self::SLEEP_START] = $sleep_start;
        $info[self::SLEEP_DURATION] = $sleep_duration;
        $status_filename = CrawlDaemon::getMessageFileName("NameServer",
            $channel);
        if ($crawl_time != 0 && file_exists($status_filename)) {
            $status = unserialize(file_get_contents($status_filename));
            if ($status[self::STATUS] == 'STOP_CRAWL') {
                $info[self::STATUS] = 'STOP_CRAWL';
                $info[self::CRAWL_TIME] = 0;
            } else {
                /* this is supposed to slow down fetchers if the indexer
                   still has a lot to index:
                   $mult_factor * C\MINIMUM_FETCH_LOOP_TIME
                   gets bigger, up to a maximum of C\PROCESS_TIMEOUT/2.
                   C\PROCESS_TIMEOUT also determines when Yioop thinks the
                   indexer is dead, so we don't want
                   MINIMUM_FETCH_LOOP_TIME to be too big. Note: if the
                   name server isn't being used for crawling, the number
                   of $mult_factors below will always be 1, so this will
                   always set the fetch loop time to
                   C\MINIMUM_FETCH_LOOP_TIME
                 */
                $tmp_base_dir = C\SCHEDULES_DIR . "/".
                    self::index_data_base_name . $crawl_time;
                $tmp_dirs = glob($tmp_base_dir . '/*', GLOB_ONLYDIR);
                $mult_factor = max(1, count($tmp_dirs));
                $info[self::MINIMUM_FETCH_LOOP_TIME] = max(min(
                    $mult_factor * C\MINIMUM_FETCH_LOOP_TIME,
                    C\PROCESS_TIMEOUT/2), C\MINIMUM_FETCH_LOOP_TIME);
            }
            if ($status[self::STATUS] != 'STOP_CRAWL' &&
                $crawl_time != $prev_crawl_time) {
                $to_copy_fields = [self::ALLOWED_SITES, self::ARC_DIR,
                    self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
                    self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
                    self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
                    self::SUMMARIZER_OPTION, self::TOR_PROXY
                    ];
                foreach ($to_copy_fields as $field) {
                    if (isset($status[$field])) {
                        $info[$field] = $status[$field];
                    }
                }
                /*
                   When initiating a new crawl AND there are active
                   classifiers (an array of class labels), then augment the
                   info with compressed, serialized versions of each active
                   classifier so that each fetcher can reconstruct the same
                   classifiers.
                 */
                $classifier_array = [];
                if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                    $classifier_array = array_merge(
                        $status[self::ACTIVE_CLASSIFIERS]);
                    $info[self::ACTIVE_CLASSIFIERS] =
                        $status[self::ACTIVE_CLASSIFIERS];
                }
                if (isset($status[self::ACTIVE_RANKERS])) {
                    $classifier_array = array_merge($classifier_array,
                        $status[self::ACTIVE_RANKERS]);
                    $info[self::ACTIVE_RANKERS] =
                        $status[self::ACTIVE_RANKERS];
                }
                if ($classifier_array != []) {
                    $classifiers_data = Classifier::loadClassifiersData(
                            $classifier_array);
                    $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
                }
            }
        }
        $info[self::SCRAPERS] = base64_encode(
            serialize($this->model("scraper")->getAllScrapers()));
        $info[self::QUEUE_SERVERS] =
            $this->model("machine")->getQueueServerUrls(0, $channel);
        $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
        $info[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
        if (count($info[self::QUEUE_SERVERS]) == 0 && $channel == 0) {
            $info[self::QUEUE_SERVERS] = [C\NAME_SERVER];
        }
        $data = [];
        $data['MESSAGE'] = serialize($info);
        $this->displayView($view, $data);
    }
    /**
     * Used to do periodic maintenance tasks for the Name Server.
     * For now, it just checks whether any fetchers the user turned on
     * have crashed and, if so, restarts them
     */
    public function doFetcherCronTasks()
    {
        $this->model("machine")->restartCrashedFetchers();
    }
    /**
     * Gets a list of all the timestamps of previously stored crawls
     *
     * This could probably be moved to the crawl model. It is a little
     * lighter weight than getCrawlList and should only be used on a name
     * server, so it is left here to avoid confusing the two.
     *
     * @return array list of timestamps
     */
    public function getCrawlTimes()
    {
        $list = [];
        $dirs = glob(C\CACHE_DIR . '/*');
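        /* crawl timestamps are recovered from names in the cache folder:
           index bundle directories contain index_data_base_name followed
           by a timestamp; network crawl stubs contain network_base_name, a
           timestamp, and what is presumably a four character extension
           that the -4 below strips off
         */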
        foreach ($dirs as $dir) {
            if (strlen($pre_timestamp = strstr($dir,
                self::index_data_base_name)) > 0) {
                $list[] = substr($pre_timestamp,
                    strlen(self::index_data_base_name));
            }
            if (strlen($pre_timestamp = strstr($dir,
                self::network_base_name)) > 0) {
                $tmp = substr($pre_timestamp,
                    strlen(self::network_base_name), -4);
                if (is_numeric($tmp)) {
                    $list[] = $tmp;
                }
            }
        }
        $list = array_unique($list);
        return $list;
    }
}