Tries to fix several minor issues with queueserver aux file names, message race conditions and start stop semantics, a=chris

Chris Pollett [2023-09-08 23:Sep:th]

Tries to fix several minor issues with queueserver aux file names, message race conditions and start stop semantics, a=chris

Filename
src/configs/Config.php
src/controllers/FetchController.php
src/controllers/MachineController.php
src/executables/Mirror.php
src/executables/QueueServer.php
src/library/CrawlConstants.php
src/library/CrawlDaemon.php
src/library/DoubleIndexBundle.php
src/library/FeedDocumentBundle.php
src/library/FetchUrl.php
src/library/IndexDocumentBundle.php
src/models/CrawlModel.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index e583469ee..bd880e6cd 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -992,8 +992,9 @@ nsconddefine('CRASH_LOG_NAME', LOG_DIR . "/YioopCrashes.log");
  */
 nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE);
 /** Number of seconds of no fetcher contact before crawl is deemed dead
- *  The files C\SCHEDULES_DIR . "/{$this->channel}-CrawlStatus.txt"
- *  is used to determine if CRAWL_TIMEOUT reached.
+ *  The files C\SCHEDULES_DIR . "/{$this->channel}-" .
+ *  L\CrawlConstants::crawl_status_file
+ *  are used to determine if CRAWL_TIMEOUT reached.
  *  This is modified by QueueServer::writeAdminMessages only when
  *  the crawl state (waiting/start crawl/ shutdown, etc) changes.
  *  It is also updated when a fetcher sends an update command to
diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php
index b33e9cc9f..7c4445e52 100755
--- a/src/controllers/FetchController.php
+++ b/src/controllers/FetchController.php
@@ -59,7 +59,7 @@ class FetchController extends Controller implements CrawlConstants
      * @var string
      */
     public $crawl_status_file_name =
-            C\SCHEDULES_DIR . "/0-CrawlStatus.txt";
+            C\SCHEDULES_DIR . "/0-" . self::crawl_status_file;
     /**
      * Number of seconds that must elapse after last call before doing
      * cron activities (mainly check liveness of fetchers which should be
@@ -91,7 +91,7 @@ class FetchController extends Controller implements CrawlConstants
         $activity = $_REQUEST['a'];
         $channel = $this->getChannel();
         $this->crawl_status_file_name =
-            C\SCHEDULES_DIR . "/{$channel}-CrawlStatus.txt";
+            C\SCHEDULES_DIR . "/{$channel}-" . self::crawl_status_file;
         $robot_table_name = C\WORK_DIRECTORY . "/{$channel}-" .
             self::robot_table_name;
         $robot_table = [];
diff --git a/src/controllers/MachineController.php b/src/controllers/MachineController.php
index 32f6c8290..8281e88e1 100644
--- a/src/controllers/MachineController.php
+++ b/src/controllers/MachineController.php
@@ -147,9 +147,10 @@ class MachineController extends Controller implements CrawlConstants
                 }
                 break;
             case 'Fetcher':
-                $id = $_REQUEST['id'];
+                $id = empty($_REQUEST['id']) ? 0 :
+                    $this->clean($_REQUEST['id'], "int");
                 if ($_REQUEST['action'] == "start" &&
-                    !isset($statuses["$channel-Fetcher"][$id ]) ) {
+                    !isset($statuses["$channel-Fetcher"][$id]) ) {
                     CrawlDaemon::start("Fetcher", "$id-$channel", "$channel");
                 } else if ($_REQUEST['action'] == "stop" &&
                     isset($statuses["$channel-Fetcher"][$id]) ) {
@@ -157,8 +158,8 @@ class MachineController extends Controller implements CrawlConstants
                 }
                 break;
             case 'RestartFetcher':
-                $error_log = C\CRASH_LOG_NAME;
-                $id = $_REQUEST['id'];
+                $id = empty($_REQUEST['id']) ? 0 :
+                    $this->clean($_REQUEST['id'], "int");
                 $msg = "Restarting $channel-Fetcher $id";
                 $time_string = date("r", time());
                 $out_msg = "[$time_string] $msg\n";
@@ -168,7 +169,7 @@ class MachineController extends Controller implements CrawlConstants
                 }
                 if (!file_exists($error_log) || filesize($error_log) >
                     C\MAX_LOG_FILE_SIZE) {
-                    /* use file_put_contents as filePutContetns doesn't
+                    /* use file_put_contents as filePutContents doesn't
                        support FILE_APPEND
                      */
                     file_put_contents($error_log, $out_msg);
diff --git a/src/executables/Mirror.php b/src/executables/Mirror.php
index 40cdec022..e3022e7b3 100644
--- a/src/executables/Mirror.php
+++ b/src/executables/Mirror.php
@@ -176,7 +176,7 @@ class Mirror implements CrawlConstants
                     continue;
                 }
             }
-            $parent_file = C\SCHEDULES_DIR . "/mirror_parent.txt";
+            $parent_file = C\SCHEDULES_DIR . "/MirrorParent.txt";
             if (file_exists($parent_file)) {
                 $this->parent_url = file_get_contents($parent_file);
                 L\crawlLog("Read File: " . $parent_file . ".");
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index fd9206280..d8cf42c97 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -414,6 +414,8 @@ class QueueServer implements CrawlConstants
         $this->waiting_hosts = [];
         $this->server_name = "IndexerAndScheduler";
         $this->process_name = "0-QueueServer";
+        $this->crawl_status_file_name =
+            C\SCHEDULES_DIR . "/{$this->channel}-" . self::crawl_status_file;
         $this->debug = "";
     }
     /**
@@ -449,7 +451,7 @@ class QueueServer implements CrawlConstants
         }
         $this->process_name = $this->channel . "-QueueServer";
         $this->crawl_status_file_name =
-            C\SCHEDULES_DIR . "/{$this->channel}-CrawlStatus.txt";
+            C\SCHEDULES_DIR . "/{$this->channel}-" . self::crawl_status_file;
         L\crawlLog("\n\nInitialize logger..", $this->process_name, true);
         $this->server_name = "IndexerAndScheduler";
         if (isset($argv[3]) && $argv[1] == "child" &&
@@ -461,8 +463,7 @@ class QueueServer implements CrawlConstants
         L\crawlLog($this->server_name . " using channel " . $this->channel);
         $remove = false;
         $old_message_names = ["QueueServerMessages.txt",
-            "SchedulerMessages.txt", "CrawlStatus.txt",
-            "ScheduleStatus.txt"];
+            self::crawl_status_file, "ScheduleStatus.txt"];
         foreach ($old_message_names as $name) {
             if (file_exists(C\SCHEDULES_DIR . "/{$this->channel}-$name")) {
                 @unlink(C\SCHEDULES_DIR . "/{$this->channel}-$name");
@@ -497,7 +498,7 @@ class QueueServer implements CrawlConstants
         if ($this->isAIndexer()) {
             $this->deleteOrphanedBundles();
         }
-        $method_statistics_file = C\LOG_DIR . "/" . $this->process_name .
+        $method_statistics_file = C\LOG_DIR . "/" . $this->server_name .
             "Stats.log";
         L\measureCall(null, $method_statistics_file);
         L\crawlLog("Profiling info will be sent to: " .
@@ -684,7 +685,7 @@ class QueueServer implements CrawlConstants
             return;
         }
         $this->debug = "";
-        L\crawlLog( "$process seems to have died restarting...");
+        L\crawlLog( "$process seems to have died...");
         $process_lines = array_slice($process_lines, -10);
         $time_string = L\makeTimestamp();
         $out_msg = "$time_string $process died. Last log lines were:\n";
@@ -744,7 +745,6 @@ class QueueServer implements CrawlConstants
             }
             $delete_dirs = [
                 C\CACHE_DIR . "/" . self::queue_base_name . $crawl_time,
-                C\CACHE_DIR . "/" . self::index_data_base_name . $crawl_time,
                 C\SCHEDULES_DIR . "/" . self::messages_data_base_name .
                     $crawl_time,
                 C\SCHEDULES_DIR . "/" . self::name_archive_iterator .
@@ -964,14 +964,15 @@ class QueueServer implements CrawlConstants
     {
         $old_info = $info;
         $is_scheduler = $this->isOnlyScheduler();
-        $message_file = C\SCHEDULES_DIR . "/" . $this->process_name .
-            "Messages.txt";
+        $message_file = CrawlDaemon::getMessageFileName(
+            "QueueServer", $this->channel);
         L\crawlLog("{$this->server_type} is checking for queue server ".
             "messages in $message_file ...");
         if (file_exists($message_file)) {
             $info = unserialize(file_get_contents($message_file));
             if ($info === false) {
                 L\crawlLog("$message_file unserializable!!!!!");
+                unlink($message_file);
                 return $old_info;
             }
             if (isset($info[self::DEBUG])) {
@@ -985,11 +986,21 @@ class QueueServer implements CrawlConstants
                 if ($this->server_type == self::BOTH ||
                   (isset($info[self::INDEXER]) && isset($info[self::SCHEDULER]))
                   ) {
+                    // both processes have seen message so unlink and unmark
                     unlink($message_file);
+                    unset($info[self::INDEXER]);
+                    unset($info[self::SCHEDULER]);
                 } else {
-                    file_put_contents($message_file, serialize($info), LOCK_EX);
+                    // write that current process has seen message
+                    $num_bytes =
+                        file_put_contents($message_file, serialize($info),
+                        LOCK_EX);
+                    if (!$num_bytes) {
+                        return $old_info;
+                    }
                 }
             } else {
+                // already seen the message waiting for other process to see
                 return $old_info;
             }
             if (isset($info[self::FETCHER_QUEUE_SERVER_RATIO])) {
@@ -1004,12 +1015,12 @@ class QueueServer implements CrawlConstants
                 case "NEW_CRAWL":
                    if ($old_info[self::STATUS] == self::CONTINUE_STATE) {
                         if (!$is_scheduler) {
-                        L\crawlLog("Stopping previous crawl before start".
-                            " new crawl!");
+                            L\crawlLog("Stopping previous crawl before start".
+                                " new crawl!");
                         } else {
-                        L\crawlLog(
-                            "Scheduler stopping for previous crawl before".
-                            " new crawl!");
+                            L\crawlLog(
+                                "Scheduler stopping for previous crawl before".
+                                " new crawl!");
                         }
                         $this->stopCrawl();
                     }
@@ -1034,6 +1045,7 @@ class QueueServer implements CrawlConstants
                                 "archive with timestamp " .
                                 $this->crawl_index);
                         }
+                        // write the crawl status file not queue server messages
                         $this->writeAdminMessage("BEGIN_CRAWL");
                     } else {
                         L\crawlLog("Scheduler started for new crawl");
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index 11d669df8..6ecc482e0 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -67,10 +67,16 @@ interface CrawlConstants
     const fetch_crawl_info = "FetchInfo";
     const fetch_closed_name = "FetchClosed";
     const schedule_start_name = "StartCrawlSchedule.txt";
+    /** File name of file used to store when various fetchers contacted a
+     *  given QueueServer machine
+     */
     const robot_table_name = "RobotTable.txt";
     const mirror_table_name = "MirrorTable.txt";
+    /** Filename used by FetchUrl::getPages for cached DNS lookups */
     const local_ip_cache_file = "LocalIpCache.txt";
+    /** used by MediaUpdater to know what machine it is for distributed jobs */
     const current_machine_info_file = "CurrentMachineInfo.txt";
+    const crawl_status_file = "CrawlStatus.txt";
     /** used for word iterator direction  */
     const ASCENDING = 1;
     const DESCENDING = -1;
diff --git a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php
index 2160fdf43..d1f1838b7 100644
--- a/src/library/CrawlDaemon.php
+++ b/src/library/CrawlDaemon.php
@@ -206,7 +206,7 @@ class CrawlDaemon implements CrawlConstants
                 $options = "";
                 $quote = (strstr(PHP_OS, "WIN")) ? '' : '"';
                 for ($i = 3; $i < count($init_argv); $i++) {
-                    $options .= $quote . $init_argv[$i]. $quote . " ";
+                    $options .= $quote . $init_argv[$i] . $quote . " ";
                 }
                 $options = trim($options);
                 $subname = (!isset($init_argv[2]) || $init_argv[2] == 'none') ?
@@ -273,13 +273,31 @@ class CrawlDaemon implements CrawlConstants
      */
     public static function start($name, $subname = "", $options = "", $exit = 1)
     {
+        if (empty($name)) {
+            echo "Must provide a non-empty daemon name";
+            exit();
+        }
         $tmp_subname = ($subname == 'none') ? '' : $subname;
         $lock_file = CrawlDaemon::getLockFileName($name, $tmp_subname);
-        if (file_exists($lock_file) && ($exit < 3 && $exit > -3)) {
-            $time = intval(file_get_contents($lock_file));
+        $alt_subname =  ($subname == "0") ? "" : ($subname == "" ?
+            "0" : "-1");
+        $alt_lock_file = $lock_file;
+        if ($alt_subname != "-1") {
+            /* this is designed to handle a case where run as terminal
+               CTRL-C exit so still have a lock file, then start in background
+               a queue server
+             */
+            $alt_lock_file = CrawlDaemon::getLockFileName($name, $alt_subname);
+        }
+        if ((file_exists($lock_file) || file_exists($alt_lock_file))
+            && ($exit < 3 && $exit > -3)) {
+            $time =  (file_exists($lock_file)) ?
+                intval(file_get_contents($lock_file)) :
+                intval(file_get_contents($alt_lock_file));
             if (time() - $time < C\PROCESS_TIMEOUT) {
                 echo "$name appears to be already running...\n";
-                echo "Try stopping it first, then running start.";
+                echo "Try stopping it first (php process_name stop), ".
+                    "then running start.\n";
                 exit();
             }
         }
diff --git a/src/library/DoubleIndexBundle.php b/src/library/DoubleIndexBundle.php
index 515b7c388..24d56b233 100644
--- a/src/library/DoubleIndexBundle.php
+++ b/src/library/DoubleIndexBundle.php
@@ -245,7 +245,8 @@ class DoubleIndexBundle implements CrawlConstants
      * @param string $taking_too_long_touch name of file to touch if
      *      checking the update takes longer than LOG_TIMEOUT. To prevent
      *      a crawl from stopping because nothing is happening the
-     *      file usually supplied is SCHEDULES_DIR/CrawlStatus.txt
+     *      file usually supplied is C\SCHEDULES_DIR . "/{$this->channel}-" .
+     *      self::crawl_status_file
      */
     public function updateDictionary($taking_too_long_touch = null)
     {
diff --git a/src/library/FeedDocumentBundle.php b/src/library/FeedDocumentBundle.php
index 749cc2184..d199f2377 100644
--- a/src/library/FeedDocumentBundle.php
+++ b/src/library/FeedDocumentBundle.php
@@ -171,8 +171,9 @@ class FeedDocumentBundle extends IndexDocumentBundle
      *
      * @param int $partition bundle partition to build inverted index for
      * @param string $taking_too_long_touch name of file to touch if building
-     *  inverted index takes too long (whether SCHEDULES_DIR/CrawlStatus.txt)
-     *  has been recently modified is used in crawling to see if have run out
+     *  inverted index takes too long (whether SCHEDULES_DIR .
+     *  "/{$this->channel}-" . CrawlConstants::crawl_status_file
+     *  has been recently modified) is used in crawling to see if have run out
      *  of new data and the crawl can stopped.
      * @param bool $just_stats whether to just compute stats on the inverted
      *      or to actually  save the results
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index a4869e840..d14cc47d0 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -281,7 +281,7 @@ class FetchUrl implements CrawlConstants
         }
         if (!empty(self::$local_ip_cache['dirty'])) {
             unset(self::$local_ip_cache['dirty']);
-            file_put_contents("$temp_dir/". self::local_ip_cache_file,
+            file_put_contents("$temp_dir/" . self::local_ip_cache_file,
                 serialize(self::$local_ip_cache));
         }
         if ($timer) {
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 3d1ddf5fa..8cbb9a2b5 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -399,9 +399,9 @@ class IndexDocumentBundle implements CrawlConstants
      *
      * @param string $taking_too_long_touch a filename of a file to touch
      *  so its last modified time becomes the current time. In a typical
-     *  Yioop crawl this is done for the CrawlStatus.txt file to prevent
-     *  Yioop's web interface from stopping the crawl because it has seen
-     *  no recent  progress activity on a crawl.
+     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
+     *  file to prevent  Yioop's web interface from stopping the crawl because
+     *  it has seen no recent  progress activity on a crawl.
      * @param bool $till_equal is set to true will keep adding each partition
      *  up till the save partition if set to false, oln;y adds one partition
      */
@@ -456,9 +456,9 @@ class IndexDocumentBundle implements CrawlConstants
      *  default the current save partition
      * @param string $taking_too_long_touch a filename of a file to touch
      *  so its last modified time becomes the current time. In a typical
-     *  Yioop crawl this is done for the CrawlStatus.txt file to prevent
-     *  Yioop's web interface from stopping the crawl because it has seen
-     *  no recent  progress activity on a crawl.
+     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
+     *  file to prevent Yioop's web interface from stopping the crawl because
+     *  it has seen no recent  progress activity on a crawl.
      */
     public function addPartitionPostingsDictionary($partition = -1,
         $taking_too_long_touch = null)
@@ -602,9 +602,9 @@ class IndexDocumentBundle implements CrawlConstants
      * @param int $partition to build index for
      * @param string $taking_too_long_touch a filename of a file to touch
      *  so its last modified time becomes the current time. In a typical
-     *  Yioop crawl this is done for the CrawlStatus.txt file to prevent
-     *  Yioop's web interface from stopping the crawl because it has seen
-     *  no recent  progress activity on a crawl.
+     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
+     *  file to prevent Yioop's web interface from stopping the crawl because
+     *  it has seen no recent  progress activity on a crawl.
      * @return mixed whether job executed to completion (true or false) if
      *  !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
      *  and TERM_STATISTICS (the latter having term frequency info)
diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php
index fe01bcc7f..f2dd0f16a 100755
--- a/src/models/CrawlModel.php
+++ b/src/models/CrawlModel.php
@@ -1329,7 +1329,7 @@ EOT;
         }
         $stat_prefix = C\SCHEDULES_DIR . "/";
         $stat_prefix_len = strlen($stat_prefix);
-        $stat_suffix = "-CrawlStatus.txt";
+        $stat_suffix = "-" . self::crawl_status_file;
         $stat_suffix_len = strlen($stat_suffix);
         $stat_files = glob("$stat_prefix*$stat_suffix");
         $statuses = [];
@@ -1338,8 +1338,8 @@ EOT;
             $crawl_status = unserialize(file_get_contents($stat_file));
             $non_repeating = (empty($crawl_status["REPEAT_TYPE"]) ||
                 intval($crawl_status["REPEAT_TYPE"]) < 0);
-            /* CrawlStatus.txt file will be updated any time data sent from
-               fetcher via FetchController update method.
+            /* CrawlConstants::crawl_status_file file will be updated any time
+               data sent from fetcher via FetchController update method.
                If no new data has arrived for CRAWL_TIMEOUT amount of time
                assume crawl not active */
             if ($non_repeating && filemtime($stat_file) +
@@ -1402,7 +1402,7 @@ EOT;
         }
         $stat_prefix = C\SCHEDULES_DIR . "/";
         $stat_prefix_len = strlen($stat_prefix);
-        $stat_suffix = "-CrawlStatus.txt";
+        $stat_suffix = "-" . self::crawl_status_file;
         $stat_suffix_len = strlen($stat_suffix);
         $stat_files = glob("$stat_prefix*$stat_suffix");
         $data = [];

ViewGit