Make it so as many as 100000 initial seed sites can be pasted into the web form, a=chris
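
The change works by capping how many of the pasted seed urls the scheduler ingests in one pass: anything beyond MAX_FETCH_SIZE is sliced off and rewritten to disk as additional small schedule files that get picked up on later passes. As a rough standalone sketch of that slicing idea (the helper name splitSeedSites and the batch size used in the example are made up for illustration, they are not Yioop code):

    <?php
    // Sketch only: break an oversized list of seed urls into batches no
    // larger than $max_fetch_size, mirroring how the patch slices
    // $sites[self::TO_CRAWL]. splitSeedSites() is a hypothetical helper.
    function splitSeedSites(array $seed_sites, int $max_fetch_size): array
    {
        $batches = [];
        $count = count($seed_sites);
        for ($i = 0; $i < $count; $i += $max_fetch_size) {
            // each batch is small enough for one processDataArchive pass
            $batches[] = array_slice($seed_sites, $i, $max_fetch_size);
        }
        return $batches;
    }
    // e.g. 100000 pasted urls with a batch size of 5000 gives 20 batches
    $batches = splitSeedSites(
        array_fill(0, 100000, "https://example.com/"), 5000);
    echo count($batches), "\n"; // 20

In the patch itself the first MAX_FETCH_SIZE urls are processed immediately and the left over urls are written by dumpBigScheduleToSmall into a day directory that sorts before other schedules, so they are reloaded first.
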
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 3706299f2..6f4b5f6fd 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1861,6 +1861,17 @@ class QueueServer implements CrawlConstants, Join
public function processDataArchive($file)
{
$sites = & $this->getDataArchiveFileData($file);
+ if (isset($sites[self::TO_CRAWL]) &&
+ count($sites[self::TO_CRAWL]) > C\MAX_FETCH_SIZE) {
+ L\crawlLog("...New URLs file has too many to crawl sites ...");
+ L\crawlLog("...Splitting first into smaller files ".
+ "before processing...");
+ $left_over = array_slice($sites[self::TO_CRAWL], C\MAX_FETCH_SIZE);
+ $sites[self::TO_CRAWL] = array_slice($sites[self::TO_CRAWL], 0,
+ C\MAX_FETCH_SIZE);
+ $this->dumpBigScheduleToSmall($left_over);
+ unset($left_over);
+ }
L\crawlLog("...Scheduler Updating Delayed Hosts Array Queue ...");
$start_time = microtime(true);
if (isset($sites[self::SCHEDULE_TIME])) {
@@ -1998,6 +2009,60 @@ class QueueServer implements CrawlConstants, Join
L\crawlLog("Scheduler: Done queue schedule file: $file");
unlink($file);
}
+ /**
+ * Used to split a large schedule of to crawl sites into smaller
+ * schedules (written to disk) of at most MAX_FETCH_SIZE sites each,
+ * so that they can be handled by processDataArchive
+ *
+ * Such a large schedule file can arise if someone pastes more than
+ * MAX_FETCH_SIZE many urls into the initial seed sites of a crawl
+ * in the UI.
+ * @param array &$sites to crawl sites to be split into smaller schedules
+ */
+ public function dumpBigScheduleToSmall(&$sites)
+ {
+ L\crawlLog("Writing small schedules...");
+ $dir = C\CRAWL_DIR."/schedules/".self::schedule_data_base_name.
+ $this->crawl_time;
+ if (!file_exists($dir)) {
+ mkdir($dir);
+ chmod($dir, 0777);
+ }
+ $now = time();
+ $day = floor($this->crawl_time/C\ONE_DAY) - 1;
+ //want before all other schedules, so will be reloaded first
+ $dir .= "/$day";
+ if (!file_exists($dir)) {
+ mkdir($dir);
+ chmod($dir, 0777);
+ }
+ /* get rid of any small schedules left over from a previous
+ processing attempt, if present */
+ $this->db->unlinkRecursive($dir, false);
+ $count = count($sites);
+ $schedule_data = [];
+ $schedule_data[self::SCHEDULE_TIME] = $this->crawl_time;
+ $schedule_data[self::TO_CRAWL] = [];
+ for ($i = 0; $i < $count; $i += C\MAX_FETCH_SIZE) {
+ L\crawlTimeoutLog("..have written %s urls of %s urls so far", $i,
+ $count);
+ $schedule_data[self::TO_CRAWL] = array_slice($sites, $i,
+ C\MAX_FETCH_SIZE);
+ if (count($schedule_data[self::TO_CRAWL]) > 0) {
+ $data_string = L\webencode(
+ gzcompress(serialize($schedule_data)));
+ $data_hash = L\crawlHash($data_string);
+ file_put_contents($dir."/At".$now."From127-0-0-1".
+ "WithHash$data_hash.txt", $data_string);
+ $data_string = "";
+ $schedule_data[self::TO_CRAWL] = [];
+ }
+ }
+ $this->db->setWorldPermissionsRecursive(
+ C\CRAWL_DIR.'/cache/'.
+ self::queue_base_name . $this->crawl_time);
+ $this->db->setWorldPermissionsRecursive($dir);
+ }
/**
* Writes status information about the current crawl so that the webserver
* app can use it for its display.