Last commit for src/library/CrawlQueueBundle.php: afd6930f42e31d81a53d42061b5fd758f56c62de

Chris Pollett [2024-01-15]
First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures
Folder structure of IndexDocumentBundles also modified and now supports overflow folder (which
could be on a different hard drive). ArcTool has been updated to support migration to new
indexes
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\compressors\NonCompressor;

/**
 * Used for the crawlHash function
 */
require_once __DIR__ . '/Utility.php';
/**
 * Encapsulates the data structures needed to have a queue of to crawl urls
 *
 * @author Chris Pollett
 */
class CrawlQueueBundle
{
    /**
     * The folder name of this CrawlQueueBundle
     * @var string
     */
    public $dir_name;
    /**
     * number of entries the priority queue used by this crawl queue bundle
     * can store
     * @var int
     */
    public $num_urls_ram;
    /**
     * Number of items that can be stored in a partition of the page exists
     * bundle
     * @var int
     */
    public $filter_size;
    /**
     * Array of hosts for which a robots.txt file has just been received and
     * processed, and for which urls from that host are still waiting to be
     * notified for queueing.
     * @var array
     */
    public $robot_notify_hosts;
    /**
     * LinearHashTable of information about company level domains that have
     * been crawled. Information includes number of SEEN_URLS, number of
     * WEIGHTED_SEEN_URLS, number of WEIGHTED_INCOMING_URLS.
     * (A company level domain is google.com or google.co.uk, but not
     *  fo.la.google.com, www.google.com, foo.google.com or foo.google.co.uk)
     * @var LinearHashTable
     */
    public $domain_table;
    /**
     * Host-IP table used for DNS look-ups; its entries come from robots.txt
     * processing and are deleted with the same frequency
     * @var HashTable
     */
    public $dns_table;
    /**
     * RAM cache of recent robots.txt info in the format
     * crawlHash(host) => robots.txt info
     * @var array
     */
    public $robot_cache = [];
    /**
     * Times at which the robots.txt info for a host was cached, in the format
     * crawlHash(host) => timestamp
     * @var array
     */
    public $robot_cache_times = [];
    /**
     * BloomFilterBundle used to keep track of which urls we've already seen
     * @var BloomFilterBundle
     */
    public $url_exists_filter_bundle;
    /**
     * LinearHashTable used to store robots.txt info (capture time, crawl
     * delay, robot paths, and flags) for each host
     * @var LinearHashTable
     */
    public $robot_table;
    /**
     * Holds etag and expires HTTP data
     * @var LinearHashTable
     */
    public $etag_table;
    /**
     * Number of bytes in a hash table key
     */
    const HASH_KEY_SIZE = 8;
    /**
     * Length of an IPv6 ip address (IPv4 addresses are padded to this length)
     */
    const IP_SIZE = 16;
    /**
     * Url type flag
     */
     const NO_FLAGS = 0;
    /**
     * Url type flag
     */
     const WAITING_HOST = 1;
    /** Size of int
     */
    const INT_SIZE = 4;
    /**
     * Maximum size in bytes of a url info file before a new file in the
     * sequence is started
     */
    const MAX_URL_FILE_SIZE = 1000000;
    /**
     * When writing rows to robot_table, how many to buffer at a time
     * before doing a bulk put.
     */
    const MAX_URL_BUFFER_BEFORE_WRITE = 500;
    /**
     * File extension used for files of serialized url data
     */
    const URL_FILES_EXTENSION = ".txt.gz";
    /**
     * Name of the folder used to store the queue of crawl-delayed hosts
     */
    const CRAWL_DELAYED_FOLDER = "CrawlDelayedHosts";
    /**
     * Name of the folder used to store the queue of urls waiting to be
     * scheduled into fetch batches
     */
    const URL_QUEUE_FOLDER = "UrlQueue";
    /**
     * Name of the folder used to store urls waiting on the download of
     * their host's robots.txt file
     */
    const ROBOT_WAIT_FOLDER = "WaitRobotUrls";
    /**
     * Makes a CrawlQueueBundle with the provided parameters
     *
     * @param string $dir_name folder name used by this CrawlQueueBundle
     * @param int $filter_size size of each partition in the page exists
     *     BloomFilterBundle
     * @param int $num_urls_ram number of entries in ram for the priority queue
     */
    public function __construct($dir_name,
        $filter_size, $num_urls_ram)
    {
        $this->dir_name = $dir_name;
        $this->filter_size = $filter_size;
        $this->num_urls_ram = $num_urls_ram;
        $this->robot_notify_hosts = [];
        if (!file_exists($this->dir_name)) {
            mkdir($this->dir_name);
        }
        if (!file_exists($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER)) {
            mkdir($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER);
        }
        if (!file_exists($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER)) {
            mkdir($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER);
        }
        if (!file_exists($this->dir_name . "/" . self::URL_QUEUE_FOLDER)) {
            mkdir($this->dir_name . "/" . self::URL_QUEUE_FOLDER);
        }
        /* Hash table containing DNS cache this is cleared whenever robot
           filters cleared
         */
        if (file_exists($dir_name . "/dns_table.dat")) {
            $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
        } else {
            $this->dns_table = new HashTable($dir_name . "/dns_table.dat",
                4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
        }
        //filter bundle to check if we have already visited a URL
        $this->url_exists_filter_bundle = new BloomFilterBundle(
            $dir_name . "/UrlExistsFilterBundle", $filter_size);
        // set up table for each company level domain to keep track of number
        // of urls downloaded
        $this->domain_table = new LinearHashTable($dir_name .
            "/CLDData", ["PRIMARY KEY" => "COMPANY_LEVEL_DOMAIN",
            "SEEN_URLS" => "INT", "WEIGHTED_SEEN_URLS" => "DOUBLE",
            "WEIGHTED_INCOMING_URLS" => "DOUBLE"]);
        //set up storage for robots.txt info
        $this->robot_table = new LinearHashTable($dir_name .
            "/RobotData", ["PRIMARY KEY" => "HOSTNAME",
            "CAPTURE_TIME" => "INT", "CRAWL_DELAY" => "INT",
            "ROBOT_PATHS" => "SERIAL", "FLAGS" => "INT"],
            LinearHashTable::MAX_ITEMS_PER_FILE,
            LinearHashTable::PARTITION_SIZE_THRESHOLD,
            C\NS_COMPRESSORS . "GzipCompressor");
        //Initialize table for cache page validation data
        $this->etag_table = new LinearHashTable($dir_name .
            '/EtagExpiresInfo', ["PRIMARY KEY" => "URL",
            "ETAG" => "TEXT", "EXPIRES" => "INT"]);
        $this->notify_buffer = [];
    }
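    /*
     * Usage sketch (illustrative comment, not part of the class API): how a
     * CrawlQueueBundle might be constructed. The directory path and the two
     * size parameters below are hypothetical values chosen for the example.
     *
     *     $queue = new CrawlQueueBundle("/tmp/TestQueueBundle",
     *         1000000, 300000);
     *
     * The constructor creates the CrawlDelayedHosts, WaitRobotUrls, and
     * UrlQueue subfolders if needed and (re)opens the DNS, domain, robot,
     * and etag tables under that same directory.
     */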
    //Filter and Filter Bundle Methods
    /**
     * Adds the supplied url to the url_exists_filter_bundle
     * @param string $url url to add
     */
    public function addSeenUrlFilter($url)
    {
        $this->url_exists_filter_bundle->add($url);
    }
    /**
     * Removes all url objects from $url_array which have been seen
     * @param array &$url_array objects to check if they have been seen
     * @param array $field_names an array of components of a url_array element
     * which contain a url to check if seen. If null, assumes url_array
     * is just an array of urls, not an array of url infos (i.e., an array of
     * arrays), and directly checks those strings
     */
    public function differenceSeenUrls(&$url_array, $field_names = null)
    {
        $this->url_exists_filter_bundle->differenceFilter(
            $url_array, $field_names);
    }
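    /*
     * Sketch of how the two seen-url filter methods above are typically used
     * together (illustrative only; the urls below are made up):
     *
     *     $queue->addSeenUrlFilter("https://www.example.com/a");
     *     $urls = ["https://www.example.com/a", "https://www.example.com/b"];
     *     $queue->differenceSeenUrls($urls);
     *     // $urls now only contains "https://www.example.com/b"
     */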
    /**
     * Returns the timestamp of the last time host's robots.txt file was
     * downloaded
     * @param string $host url to check
     * @return int|bool returns false if no capture of robots.txt yet,
     *  otherwise returns an integer timestamp
     */
    public function gotRobotTxtTime($host)
    {
        $row = $this->robot_table->get($host, ["CAPTURE_TIME"]);
        if ($row !== false) {
            return $row["CAPTURE_TIME"];
        }
        return false;
    }
    /**
     * Adds an array of url tuples to the queue of urls waiting for
     * robots.txt files to be received. This queue consists of a folder
     * CrawlQueueBundle::ROBOT_WAIT_FOLDER whose subfolders are named by the
     * hash of a host that doesn't have a robots.txt file received yet.
     * Url tuples are sorted into the appropriate host subfolder and are
     * stored in subfolders by the day received and then in a file from a
     * sequence of files according to the order received. Each file in the
     * sequence stores up to 1MB of compressed url tuples.
     *
     * @param array $url_tuples array of tuples of the form
     *  (url, weight, referer)
     */
    public function addWaitRobotQueue($url_tuples)
    {
        $robot_wait_dir = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER;
        //Group by host
        $tuples_by_host = [];
        foreach ($url_tuples as $url_tuple) {
            list($url, ) = $url_tuple;
            $host = UrlParser::getHost($url);
            $hash_host = crawlHash($host);
            $tuples_by_host[$hash_host] ??= [];
            $tuples_by_host[$hash_host][] = $url_tuple;
        }
        foreach($tuples_by_host as $hash_host => $host_tuples) {
            $hash_host_dir = $robot_wait_dir . "/$hash_host";
            $this->addUrlsDirectory($hash_host_dir, $host_tuples);
        }
    }
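    /*
     * Illustration of the grouping above (hypothetical url tuples): the two
     * example.com urls hash to the same WaitRobotUrls/crawlHash(host)
     * subfolder, while the example.net url goes into its own subfolder.
     *
     *     $queue->addWaitRobotQueue([
     *         ["https://www.example.com/a", 1.0, "https://www.example.org/"],
     *         ["https://www.example.com/b", 0.5, "https://www.example.org/"],
     *         ["https://www.example.net/c", 1.0, "https://www.example.org/"],
     *     ]);
     */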
    /**
     * Adds an array of url tuples to the queue of urls about to be scheduled
     * into fetch batches to be downloaded by fetchers. This queue consists
     * of tiers. Url tuples are sorted into a tier based on the number of
     * urls that have been downloaded for that url's host and their weight.
     * Naively, without weight, a url goes into tier
     * floor(log(# of urls downloaded already for its host)).
     * Within a tier, urls are stored in folders by day received and then in
     * a file from a sequence of files according to the order received. Each
     * file in the sequence stores up to 1MB of compressed url tuples.
     *
     * @param array $url_tuples array of tuples of the form
     *  (url, weight, referer)
     * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
     *  CrawlConstants::HOST_BUDGETING
     */
    public function addSendFetcherQueue($url_tuples, $crawl_order)
    {
        $url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
        //Group by host
        $seen_clds = [];
        $out_queues = [];
        foreach ($url_tuples as $url_tuple) {
            list($url, ) = $url_tuple;
            $cld = UrlParser::getCompanyLevelDomain($url);
            if (isset($seen_clds[$cld])) {
                $cld_data = $seen_clds[$cld];
            } else {
                $cld_data = $this->domain_table->get($cld);
            }
            if (empty($cld_data)) {
                $cld_data = ['COMPANY_LEVEL_DOMAIN' => $cld, 'SEEN_URLS' => 0,
                'WEIGHTED_SEEN_URLS' => 0.0, 'WEIGHTED_INCOMING_URLS' => 0.0];
            }
            $cld_data = $this->updateCompanyLevelDomainData($url_tuple,
                $cld_data, $crawl_order);
            $tier = $this->computeTierUrl($url_tuple,
                $cld_data, $crawl_order);
            $out_queues[$tier] ??= [];
            $out_queues[$tier][] = $url_tuple;
            $seen_clds[$cld] = $cld_data;
        }
        foreach ($seen_clds as $cld => $cld_data) {
            $this->domain_table->put($cld_data);
        }
        foreach ($out_queues as $tier => $url_tuples) {
            $out_folder = "$url_queue_folder/Tier$tier";
            $this->addUrlsDirectory($out_folder, $url_tuples);
        }
    }
    /**
     * Computes an update to the company level domain data provided in
     * cld_data, updating the WEIGHTED_SEEN_URLS and WEIGHTED_INCOMING_URLS
     * fields according to information about a discovered url in $url_tuple
     *
     * @param array $url_tuple 5-tuple containing a url, its weight,
     * the depth in the crawl where it was found, the url that referred to it,
     * and that url's weight
     * @param array $cld_data company level domain data to update
     * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
     *  CrawlConstants::HOST_BUDGETING
     * @return array updated company level domain data
     */
    public function updateCompanyLevelDomainData($url_tuple, $cld_data,
        $crawl_order)
    {
        list($url, $weight, $depth, $linking_url, $linking_weight) = $url_tuple;
        $cld_data['SEEN_URLS']++;
        if ($crawl_order == CrawlConstants::BREADTH_FIRST) {
            return $cld_data;
        }
        $cld_data['WEIGHTED_SEEN_URLS'] += min(1, 1 + log(1 + $weight, 5));
        $linking_cld = UrlParser::getCompanyLevelDomain($linking_url);
        $linking_cld_data = $this->domain_table->get($linking_cld);
        if (!empty($linking_cld_data)) {
            $linking_url_tier = floor(log10(min(1,
                $linking_cld_data['WEIGHTED_SEEN_URLS'] -
                $linking_cld_data['WEIGHTED_INCOMING_URLS'])));
            if ($cld_data['COMPANY_LEVEL_DOMAIN'] !=
                $linking_cld_data['COMPANY_LEVEL_DOMAIN']) {
                $cld_data['WEIGHTED_INCOMING_URLS'] += 1/(
                    (1.1 + $linking_url_tier + log(1 + $linking_weight, 5)));
            }
        }
        return $cld_data;
    }
    /**
     * Used to compute which send-fetcher-queue tier a url should be added
     * to, based on the data related to the url in $url_tuple,
     * its company level domain data, and the crawl order being used
     *
     * @param array $url_tuple 5-tuple containing a url, its weight, the depth
     *  in the crawl where it was found, the url that referred to it, and that
     *  url's weight
     * @param array $cld_data company level domain data for the url's host
     * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
     *  CrawlConstants::HOST_BUDGETING
     * @return int tier $url should be queued into
     */
    public function computeTierUrl($url_tuple, $cld_data, $crawl_order)
    {
        list($url, $weight, $depth, ) = $url_tuple;
        if ($crawl_order == CrawlConstants::BREADTH_FIRST) {
            return $depth;
        }
        $tier = floor(log10(max(1, $cld_data['WEIGHTED_SEEN_URLS'] -
            $cld_data['WEIGHTED_INCOMING_URLS'])));
        $robots_txt = "robots.txt";
        // put sitemaps in a higher queue
        if (in_array(substr($url, -3), [".gz", ".bz", "xml"])) {
            $tier += C\SITEMAP_TIER_PENALTY;
        } else if (UrlParser::guessMimeTypeFromFileName($url) != "text/html"
            && substr($url, -strlen($robots_txt)) != $robots_txt) {
            //slightly penalize non html documents
            $tier++;
        }
        if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) {
            crawlLog("Computed tier $tier for $url based on seen score " .
                $cld_data['WEIGHTED_SEEN_URLS'] .
                " and incoming score " . $cld_data['WEIGHTED_INCOMING_URLS']);
        }
        return $tier;
    }
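    /*
     * Worked example of the tier formula above under HOST_BUDGETING
     * (hypothetical numbers): for a company level domain with
     * WEIGHTED_SEEN_URLS = 250 and WEIGHTED_INCOMING_URLS = 30,
     *
     *     floor(log10(max(1, 250 - 30))) = floor(log10(220)) = 2,
     *
     * so an ordinary html url from that domain is queued into Tier2. A
     * sitemap-looking url would have C\SITEMAP_TIER_PENALTY added to that,
     * and a non-html, non-robots.txt url would be bumped up one extra tier.
     */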
    /**
     * This method moves urls in the waiting hosts folder for hosts listed in
     * $this->robot_notify_hosts into the send-fetcher queue. Membership of a
     * host in $this->robot_notify_hosts indicates that a robots.txt
     * file has just been received for that host, so its urls no longer need
     * to wait.
     * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
     *  CrawlConstants::HOST_BUDGETING
     */
    public function processReceivedRobotTxtUrls($crawl_order)
    {
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $db = new $db_class();
        $robot_wait_dir = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER;
        $robot_notify_hosts = $this->robot_notify_hosts;
        $len_dot_txt_gz = strlen(self::URL_FILES_EXTENSION);
        foreach ($robot_notify_hosts as $host) {
            $hash_host = crawlHash($host);
            $host_subfolder = "$robot_wait_dir/$hash_host";
            if (!file_exists($host_subfolder)) {
                continue;
            }
            $day_folders = $this->getDayFolders($host_subfolder);
            if (empty($day_folders)) {
                continue;
            }
            foreach ($day_folders as $day_folder) {
                $url_files = $this->getUrlsFiles($day_folder);
                if (empty($url_files)) {
                    continue;
                }
                foreach ($url_files as $url_file) {
                    $url_info = $this->getUrlsFileContents($url_file);
                    if (is_array($url_info)) {
                        $this->addSendFetcherQueue($url_info,
                            $crawl_order);
                    }
                    unlink($url_file);
                }
            }
            $db->unlinkRecursive("$robot_wait_dir/$hash_host", true);
        }
        $this->robot_notify_hosts = [];
    }
    /**
     * Returns an array of all the days folders for a crawl queue.
     * By design queues in a CrawlQueueBundle consist of a sequence of
     * subfolders with day timestamps (floor(unixstamp/86400)), and then
     * files within these folders. This function returns a list of the
     * day folder paths for such a queue.
     * Note this function assumes that there aren't so many day folders that
     * the list exceeds memory. If a crawl runs at most a few years, this
     * should be the case.
     * @param string $dir folder which is acting as a CrawlQueueBundle queue
     * @return array of paths to day folders
     */
    public function getDayFolders($dir)
    {
        $digit = "[0123456789]";
        $folders = glob("$dir/$digit*", GLOB_ONLYDIR);
        return $folders;
    }
    /**
     * Returns an array of all the url info files in a queue subfolder of
     * a queue for a CrawlQueueBundle. Url info files are usually named with
     * a nine digit number followed by the queue's file
     * extension (usually .txt.gz) and store up to 1MB of compressed url info.
     * This function assumes the list of paths to the url info files in the
     * provided folder fits in memory.
     *
     * @param string $dir folder containing url info files
     * @return array of paths to each url info file found.
     */
    public function getUrlsFiles($dir)
    {
        $digit = "[0123456789]";
        $files = glob("$dir/$digit*" . self::URL_FILES_EXTENSION);
        return $files;
    }
    /**
     * Returns the unserialized contents of a url info file after decompression.
     * Assumes the resulting structure is small enough to fit in memory
     *
     * @param string $file_name name of url info file
     * @return array of uncompressed, unserialized contents of this file.
     */
    public function getUrlsFileContents($file_name)
    {
        $contents = gzdecode(file_get_contents($file_name));
        return unserialize($contents);
    }
    /**
     * Serializes and compresses the url info (such as url tuples (url, weight,
     * referer)) provided in $url_data and saves the result into $file_name
     * @param string $file_name name of file to store url info into
     * @param array $url_data data to be serialized, compressed, and stored.
     */
    public function putUrlsFileContents($file_name, $url_data)
    {
        $contents = gzencode(serialize($url_data));
        return file_put_contents($file_name, $contents);
    }
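    /*
     * The two methods above are inverses of each other. A round trip might
     * look like the following (hypothetical file name and data):
     *
     *     $tuples = [["https://www.example.com/", 1.0, ""]];
     *     $queue->putUrlsFileContents("/tmp/000000001.txt.gz", $tuples);
     *     $back = $queue->getUrlsFileContents("/tmp/000000001.txt.gz");
     *     // $back == $tuples
     */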
    /**
     * Adds the url info (such as url tuples (url, weight, referer)) to
     * the appropriate file in a subfolder of the folder $dir used to
     * implement a CrawlQueueBundle queue. If $timestamp is 0, then the data
     * is stored in $dir/current day time stamp/last_file_in_folder.txt.gz .
     * If the last file exceeds 1MB, a new last file is started. If
     * $timestamp > 0 then data is stored in
     * $dir/$timestamp's day time stamp/$timestamp.txt.gz
     *
     * @param string $dir folder to store data into a subfolder of
     * @param array $url_info information to serialize, compress, and store
     * @param int $timestamp to use during storage to determine path as
     *   described above
     */
    public function addUrlsDirectory($dir, $url_info, $timestamp = 0)
    {
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $time = ($timestamp == 0) ? time() : $timestamp;
        $day = floor($time/C\ONE_DAY);
        $dir .= "/$day";
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $out_file_data = [];
        if ($timestamp > 0) {
            $out_file = "$dir/" . sprintf("%'.09d", $timestamp).
                self::URL_FILES_EXTENSION;
            if (file_exists($out_file)) {
                $out_file_data = unserialize(gzdecode(
                    file_get_contents($out_file)));
                if (!is_array($out_file_data)) {
                    $out_file_data = [];
                }
            }
        } else {
            $count_file = "$dir/count.txt";
            //using numbers rather than timestamps prevents race conditions
            $original_file_num = 1;
            if (file_exists($count_file)) {
                $original_file_num = max(intval(file_get_contents(
                    $count_file)), 1);
            }
            $file_num = $original_file_num;
            $out_file = "$dir/" . sprintf("%'.09d", $file_num) .
                self::URL_FILES_EXTENSION;
            while (file_exists($out_file)) {
                $out_file_size = filesize($out_file);
                if ($out_file_size < self::MAX_URL_FILE_SIZE) {
                    $out_file_data = unserialize(gzdecode(
                        file_get_contents($out_file)));
                    if (!is_array($out_file_data)) {
                        $out_file_data = [];
                    }
                    break;
                }
                $file_num++;
                $out_file = "$dir/" . sprintf("%'.09d", $file_num) .
                    self::URL_FILES_EXTENSION;
            }
        }
        $out_file_data = array_merge($out_file_data,  $url_info);
        $this->putUrlsFileContents($out_file, $out_file_data);
    }
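    /*
     * Path naming sketch for the method above (hypothetical values): for a
     * call with $timestamp == 0 made when time() returns 1700000000,
     *
     *     $day = floor(1700000000 / C\ONE_DAY) = 19675,
     *
     * so the url info is appended to something like
     * $dir/19675/000000001.txt.gz, with the nine digit file number advancing
     * once the current file reaches MAX_URL_FILE_SIZE (about 1MB) of
     * compressed data.
     */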
    /**
     * Returns the path to the send-fetcher-queue tier to use to make the next
     * fetch batch of urls to download.
     * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
     *  CrawlConstants::HOST_BUDGETING
     *
     * @return string|bool path to the send-fetcher-queue tier to use, or
     *  false if no urls are queued
     */
    public function chooseFetchBatchQueueFolder($crawl_order)
    {
        static $last_folder = 0;
        static $exp_max_folder = 1;
        $url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
        $sub_dirs = glob("$url_queue_folder/*", GLOB_ONLYDIR);
        if (empty($sub_dirs)) {
            return false;
        }
        if ($crawl_order == CrawlConstants::BREADTH_FIRST) {
            $is_empty = true;
            foreach($sub_dirs as $sub_dir) {
                $day_folders = $this->getDayFolders($sub_dir);
                if (!empty($day_folders)) {
                    $is_empty = false;
                    break;
                }
            }
            return ($is_empty) ? false : $sub_dir;
        }
        /* the hope of the following is to prevent looking at sitemaps
           too early in the crawl before all the seed sites are downloaded
         */
        $exp_max_folder++;
        $pre_max_folder = ceil(log($exp_max_folder, 2));
        if ($pre_max_folder >=  C\SITEMAP_TIER_PENALTY) {
            $pre_max_folder = count($sub_dirs);
        }
        $max_folder = min(count($sub_dirs), $pre_max_folder);
        $last_folder = ($last_folder < $max_folder - 1) ?
            $last_folder + 1 : 0;
        return $sub_dirs[$last_folder];
    }
    /**
     * For a timestamp $schedule_time of a fetch batch of urls to be downloaded
     * and for a list of crawl-delayed hosts in that batch, add the hosts to
     * a $schedule_time file in the CrawlDelayedHosts queue so they can be
     * notified when that fetch batch is done processing. Until notified, any
     * url from one of these crawl-delayed hosts will be rescheduled rather
     * than put in a fetch batch for download.
     *
     * @param int $schedule_time timestamp of the fetch batch schedule
     * @param array $host_urls array of urls for hosts that are crawl delayed
     *    and for which there is a schedule currently running on fetchers
     *    which might download from that host
     */
    public function addCrawlDelayedHosts($schedule_time, $host_urls)
    {
        $crawl_delayed_folder =
            $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
        $this->addUrlsDirectory($crawl_delayed_folder, $host_urls,
            $schedule_time);
    }
    /**
     * For each host in the crawl-delayed hosts queue waiting on the
     * fetch batch schedule with $timestamp timestamp, clear their FLAGS
     * variable in the robot table so that urls with this host are allowed to
     * be scheduled into future fetch batches for download.
     *
     * @param int $timestamp of a fetch batch schedule to notify
     *    crawl-delayed hosts that it has completed download.
     */
    public function notifyCrawlDelayedHosts($timestamp)
    {
        crawlLog("Scheduler: Notifying hosts that were crawl delayed by ".
            "Schedule $timestamp");
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $db = new $db_class();
        $crawl_delayed_folder =
            $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
        $day_folders = $this->getDayFolders($crawl_delayed_folder);
        //maximum crawl delay we will honor is one day
        $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY);
        if(empty($day_folders)) {
            return; //no one is waiting
        }
        $robot_rows = [];
        foreach ($day_folders as $day_folder) {
            $day_timestamp = intval(substr($day_folder, -9));
            if ($day_timestamp >= $yesterday) {
                continue;
            }
            $waiting_host_files = $this->getUrlsFiles($day_folder);
            if (!empty($waiting_host_files)) {
                foreach ($waiting_host_files as $waiting_host_file) {
                    $robot_rows = $this->processWaitingHostFile(
                        $waiting_host_file, $robot_rows);
                }
            }
            $db->unlinkRecursive($day_folder);
        }
        $stamp_day = floor($timestamp/C\ONE_DAY);
        $file_name = "$crawl_delayed_folder/$stamp_day/".
            sprintf("%'.09d", $timestamp) . self::URL_FILES_EXTENSION;
        if (file_exists($file_name)) {
            $robot_rows = $this->processWaitingHostFile($file_name,
                $robot_rows);
            if (!empty($robot_rows)) {
                if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) {
                    crawlLog(
                        "Scheduler: Notifying the following list of hosts:");
                    $i = 0;
                    foreach ($robot_rows as $robot_row) {
                        $i++;
                        crawlLog("$i. ". $robot_row['HOSTNAME'] ?? "");
                    }
                }
                $this->robot_table->put($robot_rows);
            }
        }
    }
    /**
     * Used by @see notifyCrawlDelayedHosts($timestamp).
     * For each host listed in the file $file_name get its robot info from
     * robot_table, clear its FLAG column, store the update into
     * a temporary array $robot_rows. Every MAX_URL_BUFFER_BEFORE_WRITE
     * many such hosts, write the updates in $robot_rows back to the
     * robot_table on disk. Any modified rows that have not yet been written
     * when the file has been completely processed are returned in $robot_rows.
     *
     *  @param string $file_name to get hosts to clear flag columns of
     *  @param array $robot_rows rows of updated hosts potentially from a
     *      previously processed file.
     *  @return array $robot_rows leftover updated robot host rows that haven't
     *      been written to disk yet
     */
    public function processWaitingHostFile($file_name, $robot_rows)
    {
        $waiting_hosts = $this->getUrlsFileContents($file_name);
        if (empty($waiting_hosts)) {
            return [];
        }
        foreach ($waiting_hosts as $waiting_host) {
            $robot_data = $this->robot_table->get($waiting_host);
            $robot_data["FLAGS"] = 0;
            $robot_rows[] = $robot_data;
            if (count($robot_rows) > self::MAX_URL_BUFFER_BEFORE_WRITE) {
                crawlLog("Scheduler: Notifying the following list of hosts ".
                    print_r($robot_rows, true));
                $this->robot_table->put($robot_rows);
                $robot_rows = [];
            }
        }
        return $robot_rows;
    }
    /**
     * Checks if the given $url is allowed to be crawled based on stored
     * robots.txt info.
     *
     * @param string $url to check
     * @return bool whether it was allowed or not
     */
    public function checkRobotOkay($url)
    {
        list($host, $path) = UrlParser::getHostAndPath($url, true, true);
        $path = urldecode($path);
        $robot_data = $this->getRobotData($host);
        $robot_paths = $robot_data["ROBOT_PATHS"] ?? [];
            //these should have been urldecoded in RobotProcessor
        if (empty($robot_paths)) {
            $robots_okay = false;
            $robots_not_okay = true;
        } else {
            $robots_okay = true;
            $robots_not_okay = false;
        }
        if (!empty($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
            $robots_not_okay = UrlParser::isPathMemberRegexPaths($path,
                $robot_paths[CrawlConstants::DISALLOWED_SITES]);
            $robots_okay = !$robots_not_okay;
        }
        if (!empty($robot_paths[CrawlConstants::ALLOWED_SITES])) {
            $robots_okay = UrlParser::isPathMemberRegexPaths($path,
                $robot_paths[CrawlConstants::ALLOWED_SITES]);
        }
        return $robots_okay || !$robots_not_okay;
    }
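    /*
     * Illustration of the allow/disallow logic above with hypothetical robot
     * data: if the stored ROBOT_PATHS for www.example.com were
     *
     *     [CrawlConstants::DISALLOWED_SITES => ["/private/"],
     *      CrawlConstants::ALLOWED_SITES => ["/private/public/"]],
     *
     * then checkRobotOkay("https://www.example.com/private/x") would return
     * false, while checkRobotOkay("https://www.example.com/private/public/x")
     * would return true, because the allowed paths are checked after the
     * disallowed ones. (This assumes UrlParser::isPathMemberRegexPaths treats
     * a plain path as a prefix pattern.)
     */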
    /**
     * For a provided hostname, returns the robots.txt
     * information stored in the robot table: [HOSTNAME,
     * CAPTURE_TIME, CRAWL_DELAY, ROBOT_PATHS => [ALLOWED_SITES,
     *  DISALLOWED_SITES], FLAGS (noting whether to wait for notification
     * that a schedule being downloaded has completed before continuing to
     * crawl the site)].
     *
     * @param string $host hostname to look up robots.txt info for.
     *    (no trailing / in hostname, i.e., https://www.yahoo.com, not
     *    https://www.yahoo.com/)
     * @return array robot table row as described above
     */
    public function getRobotData($host)
    {
        $key = crawlHash($host, true);
        if (isset($this->robot_cache[$key])) {
            $robot_data = $this->robot_cache[$key];
            $this->robot_cache_times[$key] = microtime(true);
        } else {
            $robot_data = $this->robot_table->get($host);
            if (!empty($robot_data)) {
                $this->robot_cache[$key] = $robot_data;
                $cache_now = microtime(true);
                $this->robot_cache_times[$key] = $cache_now;
                if (count($this->robot_cache) > C\SIZE_ROBOT_TXT_CACHE) {
                    asort($this->robot_cache_times);
                    reset($this->robot_cache_times);
                    $evict_key = key($this->robot_cache_times);
                    unset($this->robot_cache_times[$evict_key],
                        $this->robot_cache[$evict_key]);
                }
            }
        }
        return $robot_data;
    }
    /**
     * Gets the age in seconds of the oldest dns address still stored in
     * the queue bundle
     * @return int number of seconds since the DNS cache was created
     */
    public function getDnsAge()
    {
        $time = time();
        if (file_exists($this->dir_name . "/dns_timestamp.txt")) {
            $creation_time = intval(
                file_get_contents($this->dir_name . "/dns_timestamp.txt"));
        } else {
            $creation_time = $time;
        }
        return ($time - $creation_time);
    }
    /**
     * Add an entry to the crawl queue bundle's DNS cache
     *
     * @param string $host hostname to add to DNS Lookup table
     * @param string $ip_address in presentation format (not as int) to add
     *     to table
     */
    public function addDNSCache($host, $ip_address)
    {
        $pad = "000000000000";
        $hash_host = crawlHash($host, true);
        $packed_ip = inet_pton($ip_address);
        if (strlen($packed_ip) == 4) {
            $packed_ip .= $pad;
        }
        $this->dns_table->insert($hash_host, $packed_ip);
    }
    /**
     * Used to lookup an entry in the DNS cache
     *
     * @param string $host hostname to look up in the DNS table
     * @return string|bool ipv4 or ipv6 address written as a string, or false
     *     if no usable entry was found
     */
    public function dnsLookup($host)
    {
        $pad = "000000000000";
        $hash_host = crawlHash($host, true);
        $packed_ip = $this->dns_table->lookup($hash_host);
        if (!$packed_ip) return false;
        $maybe_pad = substr($packed_ip, 4);
        $maybe_ip4 = substr($packed_ip, 0, 4);
        if (strcmp($maybe_pad, $pad) == 0) {
            $ip_address = inet_ntop($maybe_ip4);
        } else {
            $ip_address = inet_ntop($packed_ip);
        }
        if (strcmp($ip_address, "0.0.0.0") == 0) {
            return false;
        }
        return $ip_address;
    }
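    /*
     * Sketch of the IPv4 padding convention shared by addDNSCache() and
     * dnsLookup() above (hypothetical host and address): inet_pton() packs
     * an IPv4 address into 4 bytes, which addDNSCache() pads with the 12
     * character string "000000000000" so every stored value is IP_SIZE (16)
     * bytes long; dnsLookup() detects and strips that pad before calling
     * inet_ntop().
     *
     *     $queue->addDNSCache("www.example.com", "93.184.216.34");
     *     $ip = $queue->dnsLookup("www.example.com"); // "93.184.216.34"
     */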
    /**
     * Gets the age in seconds of the oldest url filter data still stored in
     * the queue bundle
     * @return int number of seconds since the url filter was last emptied
     */
    public function getUrlFilterAge()
    {
        $creation_time = intval(
            file_get_contents($this->dir_name."/url_timestamp.txt"));
        return (time() - $creation_time);
    }
    /**
     * Delete the Hash table used to store DNS lookup info.
     * Then construct an empty new one.
     * This is called roughly once a day at the same time as
     * @see emptyRobotFilters()
     *
     * @return string message describing what happened during the emptying
     *     process
     */
    public function emptyDNSCache()
    {
        $num_values = $this->dns_table->num_values;
        if (file_exists($this->dir_name . "/dns_table.dat") ) {
            unlink($this->dir_name . "/dns_table.dat");
        }
        $this->dns_table = null;
        garbageCollect();
        $this->dns_table = new HashTable($this->dir_name . "/dns_table.dat",
            $num_values, self::HASH_KEY_SIZE, self::IP_SIZE);
        if ($this->dns_table) {
            $message = "Robot Emptier: dns_table empty now ".
                "and not null\n";
        } else {
            $message = "Robot Emptier: dns_table could not be ".
                "reinitialized\n";
        }
        return $message;
    }
    /**
     * Empty the crawled url filter for this crawl queue bundle; resets
     * the timestamp of the last time this filter was emptied.
     */
    public function emptyUrlFilter()
    {
        file_put_contents($this->dir_name."/url_timestamp.txt", time());
        $this->url_exists_filter_bundle->reset();
    }
}