diff --git a/src/configs/Config.php b/src/configs/Config.php index e583469ee..bd880e6cd 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -992,8 +992,9 @@ nsconddefine('CRASH_LOG_NAME', LOG_DIR . "/YioopCrashes.log"); */ nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE); /** Number of seconds of no fetcher contact before crawl is deemed dead - * The files C\SCHEDULES_DIR . "/{$this->channel}-CrawlStatus.txt" - * is used to determine if CRAWL_TIMEOUT reached. + * The files C\SCHEDULES_DIR . "/{$this->channel}-" . + * L\CrawlConstants::crawl_status_file + * are used to determine if CRAWL_TIMEOUT reached. * This is modified by QueueServer::writeAdminMessages only when * the crawl state (waiting/start crawl/ shutdown, etc) changes. * It is also updated when a fetcher sends an update command to diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index b33e9cc9f..7c4445e52 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -59,7 +59,7 @@ class FetchController extends Controller implements CrawlConstants * @var string */ public $crawl_status_file_name = - C\SCHEDULES_DIR . "/0-CrawlStatus.txt"; + C\SCHEDULES_DIR . "/0-" . self::crawl_status_file; /** * Number of seconds that must elapse after last call before doing * cron activities (mainly check liveness of fetchers which should be @@ -91,7 +91,7 @@ class FetchController extends Controller implements CrawlConstants $activity = $_REQUEST['a']; $channel = $this->getChannel(); $this->crawl_status_file_name = - C\SCHEDULES_DIR . "/{$channel}-CrawlStatus.txt"; + C\SCHEDULES_DIR . "/{$channel}-" . self::crawl_status_file; $robot_table_name = C\WORK_DIRECTORY . "/{$channel}-" . 
self::robot_table_name; $robot_table = []; diff --git a/src/controllers/MachineController.php b/src/controllers/MachineController.php index 32f6c8290..8281e88e1 100644 --- a/src/controllers/MachineController.php +++ b/src/controllers/MachineController.php @@ -147,9 +147,10 @@ class MachineController extends Controller implements CrawlConstants } break; case 'Fetcher': - $id = $_REQUEST['id']; + $id = empty($_REQUEST['id']) ? 0 : + $this->clean($_REQUEST['id'], "int"); if ($_REQUEST['action'] == "start" && - !isset($statuses["$channel-Fetcher"][$id ]) ) { + !isset($statuses["$channel-Fetcher"][$id]) ) { CrawlDaemon::start("Fetcher", "$id-$channel", "$channel"); } else if ($_REQUEST['action'] == "stop" && isset($statuses["$channel-Fetcher"][$id]) ) { @@ -157,8 +158,8 @@ class MachineController extends Controller implements CrawlConstants } break; case 'RestartFetcher': - $error_log = C\CRASH_LOG_NAME; - $id = $_REQUEST['id']; + $id = empty($_REQUEST['id']) ? 0 : + $this->clean($_REQUEST['id'], "int"); $msg = "Restarting $channel-Fetcher $id"; $time_string = date("r", time()); $out_msg = "[$time_string] $msg\n"; @@ -168,7 +169,7 @@ class MachineController extends Controller implements CrawlConstants } if (!file_exists($error_log) || filesize($error_log) > C\MAX_LOG_FILE_SIZE) { - /* use file_put_contents as filePutContetns doesn't + /* use file_put_contents as filePutContents doesn't support FILE_APPEND */ file_put_contents($error_log, $out_msg); diff --git a/src/executables/Mirror.php b/src/executables/Mirror.php index 40cdec022..e3022e7b3 100644 --- a/src/executables/Mirror.php +++ b/src/executables/Mirror.php @@ -176,7 +176,7 @@ class Mirror implements CrawlConstants continue; } } - $parent_file = C\SCHEDULES_DIR . "/mirror_parent.txt"; + $parent_file = C\SCHEDULES_DIR . "/MirrorParent.txt"; if (file_exists($parent_file)) { $this->parent_url = file_get_contents($parent_file); L\crawlLog("Read File: " . $parent_file . 
"."); diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index fd9206280..d8cf42c97 100644 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -414,6 +414,8 @@ class QueueServer implements CrawlConstants $this->waiting_hosts = []; $this->server_name = "IndexerAndScheduler"; $this->process_name = "0-QueueServer"; + $this->crawl_status_file_name = + C\SCHEDULES_DIR . "/{$this->channel}-" . self::crawl_status_file; $this->debug = ""; } /** @@ -449,7 +451,7 @@ class QueueServer implements CrawlConstants } $this->process_name = $this->channel . "-QueueServer"; $this->crawl_status_file_name = - C\SCHEDULES_DIR . "/{$this->channel}-CrawlStatus.txt"; + C\SCHEDULES_DIR . "/{$this->channel}-" . self::crawl_status_file; L\crawlLog("\n\nInitialize logger..", $this->process_name, true); $this->server_name = "IndexerAndScheduler"; if (isset($argv[3]) && $argv[1] == "child" && @@ -461,8 +463,7 @@ class QueueServer implements CrawlConstants L\crawlLog($this->server_name . " using channel " . $this->channel); $remove = false; $old_message_names = ["QueueServerMessages.txt", - "SchedulerMessages.txt", "CrawlStatus.txt", - "ScheduleStatus.txt"]; + self::crawl_status_file, "ScheduleStatus.txt"]; foreach ($old_message_names as $name) { if (file_exists(C\SCHEDULES_DIR . "/{$this->channel}-$name")) { @unlink(C\SCHEDULES_DIR . "/{$this->channel}-$name"); @@ -497,7 +498,7 @@ class QueueServer implements CrawlConstants if ($this->isAIndexer()) { $this->deleteOrphanedBundles(); } - $method_statistics_file = C\LOG_DIR . "/" . $this->process_name . + $method_statistics_file = C\LOG_DIR . "/" . $this->server_name . "Stats.log"; L\measureCall(null, $method_statistics_file); L\crawlLog("Profiling info will be sent to: " . 
@@ -684,7 +685,7 @@ class QueueServer implements CrawlConstants return; } $this->debug = ""; - L\crawlLog( "$process seems to have died restarting..."); + L\crawlLog( "$process seems to have died..."); $process_lines = array_slice($process_lines, -10); $time_string = L\makeTimestamp(); $out_msg = "$time_string $process died. Last log lines were:\n"; @@ -744,7 +745,6 @@ class QueueServer implements CrawlConstants } $delete_dirs = [ C\CACHE_DIR . "/" . self::queue_base_name . $crawl_time, - C\CACHE_DIR . "/" . self::index_data_base_name . $crawl_time, C\SCHEDULES_DIR . "/" . self::messages_data_base_name . $crawl_time, C\SCHEDULES_DIR . "/" . self::name_archive_iterator . @@ -964,14 +964,15 @@ class QueueServer implements CrawlConstants { $old_info = $info; $is_scheduler = $this->isOnlyScheduler(); - $message_file = C\SCHEDULES_DIR . "/" . $this->process_name . - "Messages.txt"; + $message_file = CrawlDaemon::getMessageFileName( + "QueueServer", $this->channel); L\crawlLog("{$this->server_type} is checking for queue server ". 
"messages in $message_file ..."); if (file_exists($message_file)) { $info = unserialize(file_get_contents($message_file)); if ($info === false) { L\crawlLog("$message_file unserializable!!!!!"); + unlink($message_file); return $old_info; } if (isset($info[self::DEBUG])) { @@ -985,11 +986,21 @@ class QueueServer implements CrawlConstants if ($this->server_type == self::BOTH || (isset($info[self::INDEXER]) && isset($info[self::SCHEDULER])) ) { + // both processes have seen message so unlink and unmark unlink($message_file); + unset($info[self::INDEXER]); + unset($info[self::SCHEDULER]); } else { - file_put_contents($message_file, serialize($info), LOCK_EX); + // write that current process has seen message + $num_bytes = + file_put_contents($message_file, serialize($info), + LOCK_EX); + if (!$num_bytes) { + return $old_info; + } } } else { + // already seen the message waiting for other process to see return $old_info; } if (isset($info[self::FETCHER_QUEUE_SERVER_RATIO])) { @@ -1004,12 +1015,12 @@ class QueueServer implements CrawlConstants case "NEW_CRAWL": if ($old_info[self::STATUS] == self::CONTINUE_STATE) { if (!$is_scheduler) { - L\crawlLog("Stopping previous crawl before start". - " new crawl!"); + L\crawlLog("Stopping previous crawl before start". + " new crawl!"); } else { - L\crawlLog( - "Scheduler stopping for previous crawl before". - " new crawl!"); + L\crawlLog( + "Scheduler stopping for previous crawl before". + " new crawl!"); } $this->stopCrawl(); } @@ -1034,6 +1045,7 @@ class QueueServer implements CrawlConstants "archive with timestamp " . 
$this->crawl_index); } + // write the crawl status file not queue server messages $this->writeAdminMessage("BEGIN_CRAWL"); } else { L\crawlLog("Scheduler started for new crawl"); diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 11d669df8..6ecc482e0 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -67,10 +67,16 @@ interface CrawlConstants const fetch_crawl_info = "FetchInfo"; const fetch_closed_name = "FetchClosed"; const schedule_start_name = "StartCrawlSchedule.txt"; + /** File name of file used to store when various fetchers contacted a + * given QueueServer machine + */ const robot_table_name = "RobotTable.txt"; const mirror_table_name = "MirrorTable.txt"; + /** Filename used by FetchUrl::getPages for cached DNS lookups */ const local_ip_cache_file = "LocalIpCache.txt"; + /** used by MediaUpdater to know what machine it is for distributed jobs */ const current_machine_info_file = "CurrentMachineInfo.txt"; + const crawl_status_file = "CrawlStatus.txt"; /** used for word iterator direction */ const ASCENDING = 1; const DESCENDING = -1; diff --git a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php index 2160fdf43..d1f1838b7 100644 --- a/src/library/CrawlDaemon.php +++ b/src/library/CrawlDaemon.php @@ -206,7 +206,7 @@ class CrawlDaemon implements CrawlConstants $options = ""; $quote = (strstr(PHP_OS, "WIN")) ? '' : '"'; for ($i = 3; $i < count($init_argv); $i++) { - $options .= $quote . $init_argv[$i]. $quote . " "; + $options .= $quote . $init_argv[$i] . $quote . " "; } $options = trim($options); $subname = (!isset($init_argv[2]) || $init_argv[2] == 'none') ? @@ -273,13 +273,31 @@ class CrawlDaemon implements CrawlConstants */ public static function start($name, $subname = "", $options = "", $exit = 1) { + if (empty($name)) { + echo "Must provide a non-empty daemon name"; + exit(); + } $tmp_subname = ($subname == 'none') ?
'' : $subname; $lock_file = CrawlDaemon::getLockFileName($name, $tmp_subname); - if (file_exists($lock_file) && ($exit < 3 && $exit > -3)) { - $time = intval(file_get_contents($lock_file)); + $alt_subname = ($subname == "0") ? "" : ($subname == "" ? + "0" : "-1"); + $alt_lock_file = $lock_file; + if ($alt_subname != "-1") { + /* this is designed to handle a case where run as terminal + CTRL-C exit so still have a lock file, then start in background + a queue server + */ + $alt_lock_file = CrawlDaemon::getLockFileName($name, $alt_subname); + } + if ((file_exists($lock_file) || file_exists($alt_lock_file)) + && ($exit < 3 && $exit > -3)) { + $time = (file_exists($lock_file)) ? + intval(file_get_contents($lock_file)) : + intval(file_get_contents($alt_lock_file)); if (time() - $time < C\PROCESS_TIMEOUT) { echo "$name appears to be already running...\n"; - echo "Try stopping it first, then running start."; + echo "Try stopping it first (php process_name stop), ". + "then running start.\n"; exit(); } } diff --git a/src/library/DoubleIndexBundle.php b/src/library/DoubleIndexBundle.php index 515b7c388..24d56b233 100644 --- a/src/library/DoubleIndexBundle.php +++ b/src/library/DoubleIndexBundle.php @@ -245,7 +245,8 @@ class DoubleIndexBundle implements CrawlConstants * @param string $taking_too_long_touch name of file to touch if * checking the update takes longer than LOG_TIMEOUT. To prevent * a crawl from stopping because nothing is happening the - * file usually supplied is SCHEDULES_DIR/CrawlStatus.txt + * file usually supplied is C\SCHEDULES_DIR . "/{$this->channel}-" . 
+ * self::crawl_status_file */ public function updateDictionary($taking_too_long_touch = null) { diff --git a/src/library/FeedDocumentBundle.php b/src/library/FeedDocumentBundle.php index 749cc2184..d199f2377 100644 --- a/src/library/FeedDocumentBundle.php +++ b/src/library/FeedDocumentBundle.php @@ -171,8 +171,9 @@ class FeedDocumentBundle extends IndexDocumentBundle * * @param int $partition bundle partition to build inverted index for * @param string $taking_too_long_touch name of file to touch if building - * inverted index takes too long (whether SCHEDULES_DIR/CrawlStatus.txt) - * has been recently modified is used in crawling to see if have run out + * inverted index takes too long (whether SCHEDULES_DIR . + * "/{$this->channel}-" . CrawlConstants::crawl_status_file + * has been recently modified) is used in crawling to see if have run out * of new data and the crawl can stopped. * @param bool $just_stats whether to just compute stats on the inverted * or to actually save the results diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php index a4869e840..d14cc47d0 100755 --- a/src/library/FetchUrl.php +++ b/src/library/FetchUrl.php @@ -281,7 +281,7 @@ class FetchUrl implements CrawlConstants } if (!empty(self::$local_ip_cache['dirty'])) { unset(self::$local_ip_cache['dirty']); - file_put_contents("$temp_dir/". self::local_ip_cache_file, + file_put_contents("$temp_dir/" . self::local_ip_cache_file, serialize(self::$local_ip_cache)); } if ($timer) { diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index 3d1ddf5fa..8cbb9a2b5 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -399,9 +399,9 @@ class IndexDocumentBundle implements CrawlConstants * * @param string $taking_too_long_touch a filename of a file to touch * so its last modified time becomes the current time.
In a typical - * Yioop crawl this is done for the CrawlStatus.txt file to prevent - * Yioop's web interface from stopping the crawl because it has seen - * no recent progress activity on a crawl. + * Yioop crawl this is done for the CrawlConstants::crawl_status_file + * file to prevent Yioop's web interface from stopping the crawl because + * it has seen no recent progress activity on a crawl. * @param bool $till_equal is set to true will keep adding each partition * up till the save partition if set to false, oln;y adds one partition */ @@ -456,9 +456,9 @@ class IndexDocumentBundle implements CrawlConstants * default the current save partition * @param string $taking_too_long_touch a filename of a file to touch * so its last modified time becomes the current time. In a typical - * Yioop crawl this is done for the CrawlStatus.txt file to prevent - * Yioop's web interface from stopping the crawl because it has seen - * no recent progress activity on a crawl. + * Yioop crawl this is done for the CrawlConstants::crawl_status_file + * file to prevent Yioop's web interface from stopping the crawl because + * it has seen no recent progress activity on a crawl. */ public function addPartitionPostingsDictionary($partition = -1, $taking_too_long_touch = null) @@ -602,9 +602,9 @@ class IndexDocumentBundle implements CrawlConstants * @param int $partition to build index for * @param string $taking_too_long_touch a filename of a file to touch * so its last modified time becomes the current time. In a typical - * Yioop crawl this is done for the CrawlStatus.txt file to prevent - * Yioop's web interface from stopping the crawl because it has seen - * no recent progress activity on a crawl. + * Yioop crawl this is done for the CrawlConstants::crawl_status_file + * file to prevent Yioop's web interface from stopping the crawl because + * it has seen no recent progress activity on a crawl. 
* @return mixed whether job executed to completion (true or false) if * !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS, * and TERM_STATISTICS (the latter having term frequency info) diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php index fe01bcc7f..f2dd0f16a 100755 --- a/src/models/CrawlModel.php +++ b/src/models/CrawlModel.php @@ -1329,7 +1329,7 @@ EOT; } $stat_prefix = C\SCHEDULES_DIR . "/"; $stat_prefix_len = strlen($stat_prefix); - $stat_suffix = "-CrawlStatus.txt"; + $stat_suffix = "-" . self::crawl_status_file; $stat_suffix_len = strlen($stat_suffix); $stat_files = glob("$stat_prefix*$stat_suffix"); $statuses = []; @@ -1338,8 +1338,8 @@ EOT; $crawl_status = unserialize(file_get_contents($stat_file)); $non_repeating = (empty($crawl_status["REPEAT_TYPE"]) || intval($crawl_status["REPEAT_TYPE"]) < 0); - /* CrawlStatus.txt file will be updated any time data sent from - fetcher via FetchController update method. + /* CrawlConstants::crawl_status_file file will be updated any time data + sent from fetcher via FetchController update method. If no new data has arrived for CRAWL_TIMEOUT amount of time assume crawl not active */ if ($non_repeating && filemtime($stat_file) + @@ -1402,7 +1402,7 @@ EOT; } $stat_prefix = C\SCHEDULES_DIR . "/"; $stat_prefix_len = strlen($stat_prefix); - $stat_suffix = "-CrawlStatus.txt"; + $stat_suffix = "-" . self::crawl_status_file; $stat_suffix_len = strlen($stat_suffix); $stat_files = glob("$stat_prefix*$stat_suffix"); $data = [];