Make it so as many as 100000 initial seed sites can be pasted into the web form, a=chris
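
The change works by capping how many of the pasted seed urls the scheduler ingests in one pass: anything beyond MAX_FETCH_SIZE is sliced off and rewritten to disk as additional small schedule files that get picked up on later passes. As a rough standalone sketch of that slicing idea (the helper name splitSeedSites and the batch size used in the example are made up for illustration, they are not Yioop code):

    <?php
    // Sketch only: break an oversized list of seed urls into batches no
    // larger than $max_fetch_size, mirroring how the patch slices
    // $sites[self::TO_CRAWL]. splitSeedSites() is a hypothetical helper.
    function splitSeedSites(array $seed_sites, int $max_fetch_size): array
    {
        $batches = [];
        $count = count($seed_sites);
        for ($i = 0; $i < $count; $i += $max_fetch_size) {
            // each batch is small enough for one processDataArchive pass
            $batches[] = array_slice($seed_sites, $i, $max_fetch_size);
        }
        return $batches;
    }
    // e.g. 100000 pasted urls with a batch size of 5000 gives 20 batches
    $batches = splitSeedSites(
        array_fill(0, 100000, "https://example.com/"), 5000);
    echo count($batches), "\n"; // 20

In the patch itself the first MAX_FETCH_SIZE urls are processed immediately and the left over urls are written by dumpBigScheduleToSmall into a day directory that sorts before other schedules, so they are reloaded first.
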
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 3706299f2..6f4b5f6fd 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1861,6 +1861,17 @@ class QueueServer implements CrawlConstants, Join
public function processDataArchive($file)
{
$sites = & $this->getDataArchiveFileData($file);
+ if (isset($sites[self::TO_CRAWL]) &&
+ count($sites[self::TO_CRAWL]) > C\MAX_FETCH_SIZE) {
+ L\crawlLog("...New URLs file has too many to crawl sites ...");
+ L\crawlLog("...Splitting first into smaller files ".
+ "before processing...");
+ $left_over = array_slice($sites[self::TO_CRAWL], C\MAX_FETCH_SIZE);
+ $sites[self::TO_CRAWL] = array_slice($sites[self::TO_CRAWL], 0,
+ C\MAX_FETCH_SIZE);
+ $this->dumpBigScheduleToSmall($left_over);
+ unset($left_over);
+ }
L\crawlLog("...Scheduler Updating Delayed Hosts Array Queue ...");
$start_time = microtime(true);
if (isset($sites[self::SCHEDULE_TIME])) {
@@ -1998,6 +2009,60 @@ class QueueServer implements CrawlConstants, Join
L\crawlLog("Scheduler: Done queue schedule file: $file");
unlink($file);
}
+ /**
+ * Used to split a large schedule of to crawl sites into smaller
+ * schedules (written to disk) of at most MAX_FETCH_SIZE sites each,
+ * so that they can be handled by processDataArchive
+ *
+ * Such a large schedule file can arise if someone pastes more than
+ * MAX_FETCH_SIZE many urls into the initial seed sites of a crawl
+ * in the UI.
+ * @param array &$sites to crawl sites to be split into smaller schedules
+ */
+ public function dumpBigScheduleToSmall(&$sites)
+ {
+ L\crawlLog("Writing small schedules...");
+ $dir = C\CRAWL_DIR."/schedules/".self::schedule_data_base_name.
+ $this->crawl_time;
+ if (!file_exists($dir)) {
+ mkdir($dir);
+ chmod($dir, 0777);
+ }
+ $now = time();
+ $day = floor($this->crawl_time/C\ONE_DAY) - 1;
+ //want before all other schedules, so will be reloaded first
+ $dir .= "/$day";
+ if (!file_exists($dir)) {
+ mkdir($dir);
+ chmod($dir, 0777);
+ }
+ /* get rid of any small schedules left over from a previous
+ processing attempt, if present */
+ $this->db->unlinkRecursive($dir, false);
+ $count = count($sites);
+ $schedule_data = [];
+ $schedule_data[self::SCHEDULE_TIME] = $this->crawl_time;
+ $schedule_data[self::TO_CRAWL] = [];
+ for ($i = 0; $i < $count; $i += C\MAX_FETCH_SIZE) {
+ L\crawlTimeoutLog("..have written %s urls of %s urls so far", $i,
+ $count);
+ $schedule_data[self::TO_CRAWL] = array_slice($sites, $i,
+ C\MAX_FETCH_SIZE);
+ if (count($schedule_data[self::TO_CRAWL]) > 0) {
+ $data_string = L\webencode(
+ gzcompress(serialize($schedule_data)));
+ $data_hash = L\crawlHash($data_string);
+ file_put_contents($dir."/At".$now."From127-0-0-1".
+ "WithHash$data_hash.txt", $data_string);
+ $data_string = "";
+ $schedule_data[self::TO_CRAWL] = [];
+ }
+ }
+ $this->db->setWorldPermissionsRecursive(
+ C\CRAWL_DIR.'/cache/'.
+ self::queue_base_name . $this->crawl_time);
+ $this->db->setWorldPermissionsRecursive($dir);
+ }
/**
* Writes status information about the current crawl so that the webserver
* app can use it for its display.