Fix a crash problem in QueueServer caused by array access on false in robots handling, a=chris
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 1ce78c81b..6e0019de2 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1925,7 +1925,7 @@ class QueueServer implements CrawlConstants
$sites = unserialize(gzuncompress(
L\webdecode(file_get_contents($file))));
L\crawlLog("Scheduler done decompressing robot file");
- if (isset($sites)) {
+ if (!empty($sites)) {
$num_sites = count($sites);
$i = 0;
foreach ($sites as $robot_host => $robot_info) {
diff --git a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php
index 231c7678e..8186c9a51 100644
--- a/src/library/CrawlDaemon.php
+++ b/src/library/CrawlDaemon.php
@@ -67,7 +67,7 @@ class CrawlDaemon implements CrawlConstants
*/
public static $mode;
/**
- * Tick callback function used to update the timestamp in this processes
+ * Callback function used to update the timestamp in this process's
* lock. If lock_file does not exist or more than PROCESS_TIMEOUT
* time has elapsed since the last processHandler call it stops the process
*
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index bfd0bd668..307733e17 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -100,7 +100,10 @@ class IndexDocumentBundle implements CrawlConstants
*/
const DOCUMENTS_FOLDER = "documents";
/**
- *
+ * Name of the last entries file used to help compute difference lists
+ * for doc_index, and position list offsets used in postings for the
+ * partition. This file is also used to track the total number of
+ * occurrences of a term in a partition
*/
const LAST_ENTRIES_FILENAME = "last_entries";
/**
@@ -114,7 +117,8 @@ class IndexDocumentBundle implements CrawlConstants
self::LAST_ENTRIES_FILENAME, self::POSITIONS_FILENAME,
self::POSTINGS_FILENAME];
/**
- *
+ * Name of the file within a partition's positions_doc_maps folder used
+ * to contain the partition's position list for each term.
*/
const POSITIONS_FILENAME = "positions";
/**
@@ -122,7 +126,12 @@ class IndexDocumentBundle implements CrawlConstants
*/
const POSTINGS_FILENAME = "postings";
/**
- *
+ * Name of the folder used to hold position lists and document maps. Within
+ * this folder there is a subfolder for each partition which contains a
+ * doc_map file, postings file for the docs within the partition,
+ * position lists file for those postings, and a last_entries file
+ * used in the computation of difference lists for doc_index and position
+ * list offsets, as well as the number of occurrences of terms.
*/
const POSITIONS_DOC_MAP_FOLDER = "positions_doc_maps";
/**
@@ -154,15 +163,17 @@ class IndexDocumentBundle implements CrawlConstants
*/
public $dictionary;
/**
- * Makes or initializes an IndexArchiveBundle with the provided parameters
+ * Makes or initializes an IndexDocumentBundle with the provided parameters
*
* @param string $dir_name folder name to store this bundle
* @param bool $read_only_archive whether to open archive only for reading
* or reading and writing
* @param string $description a text name/serialized info about this
- * IndexArchiveBundle
- * @param int $num_docs_per_partition the number of pages to be stored
- * in a single shard
+ * IndexDocumentBundle
+ * @param int $num_docs_per_partition the number of documents to be stored
+ * in a single partition
+ * @param int $max_keys the maximum number of keys used by the BPlusTree
+ * used for the inverted index
*/
public function __construct($dir_name, $read_only_archive = true,
$description = null, $num_docs_per_partition =
@@ -248,7 +259,6 @@ class IndexDocumentBundle implements CrawlConstants
}
/**
*
- * @param int $add_num_docs number of docs in the shard about to be added
* @param string $taking_too_long_touch a filename of a file to touch
* so its last modified time becomes the current time. In a typical
* Yioop crawl this is done for the crawl_status.txt file to prevent