Fix double encoding issue in search queries, clean up some QueueServer code, a=chris

Chris Pollett [2019-07-05 23h]
Fix double encoding issue in search queries, clean up some QueueServer code, a=chris
Filename
src/controllers/FetchController.php
src/controllers/SearchController.php
src/controllers/components/CrawlComponent.php
src/executables/Fetcher.php
src/executables/QueueServer.php
src/library/BloomFilterBundle.php
diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php
index a50e96a63..6c1d2d0a8 100755
--- a/src/controllers/FetchController.php
+++ b/src/controllers/FetchController.php
@@ -288,12 +288,13 @@ class FetchController extends Controller implements CrawlConstants
             $crawl_time = 0;
             $check_crawl_time = 0;
         }
+        $channel = $this->getChannel();
         $index_schedule_file = C\CRAWL_DIR . "/schedules/" .
             self::index_closed_name . $crawl_time . ".txt";
         if ($crawl_time > 0 && file_exists($index_schedule_file) &&
             $check_crawl_time > intval(fileatime($index_schedule_file)) &&
             !file_exists(C\CRAWL_DIR .
-                "/schedules/QueueServerMessages.txt") ) {
+                "/schedules/$channel-QueueServerMessages.txt") ) {
             $restart = true;
             if (file_exists($this->crawl_status_file_name)) {
                 $crawl_status = unserialize(file_get_contents(
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 3798f8562..8ad9f87d2 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -321,7 +321,7 @@ class SearchController extends Controller implements CrawlConstants
                     $this->subsearch_identifier = $search["INDEX_IDENTIFIER"];
                     if (!isset($_REQUEST['num']) &&
                         isset($search["PER_PAGE"])) {
-                        $_REQUEST['num']= $search["PER_PAGE"];
+                        $_REQUEST['num'] = $search["PER_PAGE"];
                     }
                     break;
                 }
@@ -864,8 +864,7 @@ class SearchController extends Controller implements CrawlConstants
                 if (!empty($this->subsearch_name)) {
                     $data['PAGING_QUERY']['s'] = $this->subsearch_name;
                 }
-                $data['QUERY'] = urlencode($this->clean($data['QUERY'],
-                    "string"));
+                $data['QUERY'] = urlencode($data['QUERY']);
                 break;
             case "query":
                 // no break
@@ -885,7 +884,7 @@ class SearchController extends Controller implements CrawlConstants
                 if (!empty($this->subsearch_name)) {
                     $data['PAGING_QUERY']['s'] =  $this->subsearch_name;
                 }
-                $data['QUERY'] = urlencode($this->clean($query, "string"));
+                $data['QUERY'] = urlencode($query);
                 if ((php_sapi_name() != 'cli' ||
                     C\nsdefined("IS_OWN_WEB_SERVER")) &&
                     C\nsdefined("MONETIZATION_TYPE") &&
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 35482421c..15cdc19a2 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -328,7 +328,7 @@ class CrawlComponent extends Component implements CrawlConstants
             is_array($crawl_params[self::INDEXING_PLUGINS])) {
             foreach ($crawl_params[self::INDEXING_PLUGINS] as $plugin) {
                 if ($plugin == "") {continue;}
-                $plugin_class = C\NS_PLUGINS . $plugin."Plugin";
+                $plugin_class = C\NS_PLUGINS . $plugin . "Plugin";
                 $plugin_obj = $parent->plugin(lcfirst($plugin));
                 if (method_exists($plugin_class, "loadConfiguration")) {
                     $crawl_params[self::INDEXING_PLUGINS_DATA][$plugin] =
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index a5e84143c..9a6a5ed3f 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -665,7 +665,9 @@ class Fetcher implements CrawlConstants
                $info[self::STATUS] being set
              */
             if (!isset($info[self::STATUS])) {
-                if ($info === true) {$info = [];}
+                if ($info === true) {
+                    $info = [];
+                }
                 $info[self::STATUS] = self::CONTINUE_STATE;
             }
             if ($info[self::STATUS] == self::NO_DATA_STATE) {
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index b312340b9..96a14b448 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -336,6 +336,7 @@ class QueueServer implements CrawlConstants, Join
         $this->archive_modified_time = 0;
         $this->crawl_time = 0;
         $this->channel = 0;
+        $this->repeat_type = -1;
         $this->robots_txt = C\ALWAYS_FOLLOW_ROBOTS;
         $this->cache_pages = true;
         $this->page_recrawl_frequency = C\PAGE_RECRAWL_FREQUENCY;
@@ -397,7 +398,11 @@ class QueueServer implements CrawlConstants, Join
             "schedule_status.txt"];
         foreach ($old_message_names as $name) {
             if (file_exists(C\CRAWL_DIR."/schedules/{$this->channel}-$name")) {
-                @unlink(C\CRAWL_DIR."/schedules/{$this->channel}-$name");
+                @unlink(C\CRAWL_DIR . "/schedules/{$this->channel}-$name");
+                $remove = true;
+            }
+            if (file_exists(C\CRAWL_DIR."/schedules/$name")) {
+                @unlink(C\CRAWL_DIR . "/schedules/$name");
                 $remove = true;
             }
         }
@@ -587,7 +592,7 @@ class QueueServer implements CrawlConstants, Join
         $init_args = ["QueueServer.php", "start", $this->channel, $process];
         L\crawlLog( "!!!!Writing to $error_log ".
             "crash message about $process...");
-        CrawlDaemon::init( $init_args, "QueueServer", -3);
+        CrawlDaemon::init($init_args, "QueueServer", -3);
         if ($info[self::STATUS] != self::WAITING_START_MESSAGE_STATE) {
             L\crawlLog("Sleeping before sending restart message other process");
             sleep(2 * C\QUEUE_SLEEP_TIME);
@@ -595,6 +600,7 @@ class QueueServer implements CrawlConstants, Join
             $crawl_params[self::STATUS] = "RESUME_CRAWL";
             $crawl_params[self::CRAWL_TIME] = $this->crawl_time;
             $crawl_params[self::CRAWL_TYPE] = $this->crawl_type;
+            $crawl_params[self::REPEAT_TYPE] = $this->repeat_type;
             $crawl_params[self::CHANNEL] = $this->channel;
             $info_string = serialize($crawl_params);
             $process_message_file = C\CRAWL_DIR . "/schedules/" .
@@ -737,7 +743,7 @@ class QueueServer implements CrawlConstants, Join
                 $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
                 if ($count < C\NUM_URLS_QUEUE_RAM -
                     C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
-                    $info = $this->processQueueUrls();
+                    $this->processQueueUrls();
                 }
                 if ($count > 0) {
                     $top = $this->web_queue->peekQueue();
@@ -758,7 +764,7 @@ class QueueServer implements CrawlConstants, Join
                         been taken by some fetcher
                      */
                     if (!file_exists(
-                        C\CRAWL_DIR."/schedules/" . self::schedule_name.
+                        C\CRAWL_DIR . "/schedules/" . self::schedule_name.
                         $this->crawl_time . ".txt")) {
                             $this->produceFetchBatch();
                     }
@@ -984,7 +990,7 @@ class QueueServer implements CrawlConstants, Join
                 break;
                 case "RESUME_CRAWL":
                     if (isset($info[self::CRAWL_TIME]) &&
-                        (file_exists(C\CRAWL_DIR.'/cache/'.
+                        (file_exists(C\CRAWL_DIR . '/cache/'.
                         self::queue_base_name . $info[self::CRAWL_TIME]) ||
                         $info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) ) {
                         if ($old_info[self::STATUS] == self::CONTINUE_STATE) {
@@ -1300,8 +1306,6 @@ class QueueServer implements CrawlConstants, Join
         $this->waiting_hosts = [];
         $this->initializeWebQueue();
         $this->initializeIndexBundle($info, $try_to_set_from_old_index);
-        $info[self::STATUS] = self::CONTINUE_STATE;
-        return $info;
     }
     /**
      * Function used to set up an indexer's IndexArchiveBundle or
@@ -1320,7 +1324,7 @@ class QueueServer implements CrawlConstants, Join
         if ($try_to_set_from_old_index === null) {
             $try_to_set_from_old_index = array_keys(self::$info_parameter_map);
         }
-        if(empty($this->repeat_type) || $this->repeat_type < 0) {
+        if(empty($this->repeat_type) || $this->repeat_type <= 0) {
             $class_name = C\NS_LIB . "IndexArchiveBundle";
             $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
                 $this->crawl_time;
@@ -1351,7 +1355,7 @@ class QueueServer implements CrawlConstants, Join
                    (might take a while if merging dictionary)
                  */
                 $this->writeCrawlStatus($sites);
-            } else if (!empty($this->repeat_type) && $this->repeat_type >= 0) {
+            } else if (!empty($this->repeat_type) && $this->repeat_type > 0) {
                 $this->index_archive = new $class_name($dir, false,
                     serialize($info), C\NUM_DOCS_PER_GENERATION,
                     $this->repeat_type);
@@ -2087,15 +2091,11 @@ class QueueServer implements CrawlConstants, Join
      * Checks for a new crawl file or a schedule data for the current crawl and
      * if such a exists then processes its contents adding the relevant urls to
      * the priority queue
-     *
-     * @return array info array with continue status
      */
     public function processQueueUrls()
     {
         L\crawlLog("Scheduler Start checking for new URLs data memory usage: ".
             memory_get_usage());
-        $info = [];
-        $info[self::STATUS] = self::CONTINUE_STATE;
         $start_schedule_filename = C\CRAWL_DIR . "/schedules/" .
             $this->channel . "-" . self::schedule_start_name;
         if (file_exists($start_schedule_filename)) {
@@ -2107,13 +2107,11 @@ class QueueServer implements CrawlConstants, Join
             L\crawlLog("Scheduler Start schedule urls" .
                 $start_schedule_filename);
             $this->processDataArchive($start_schedule_filename);
-            return $info;
         }
         $schedule_dir = C\CRAWL_DIR."/schedules/" .
             self::schedule_data_base_name . $this->crawl_time;
         $this->processDataFile($schedule_dir, "processDataArchive");
         L\crawlLog("done.");
-        return $info;
     }
     /**
      * Process a file of to-crawl urls adding to or adjusting the weight in
diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php
index 04ae13dc0..eb36e302b 100644
--- a/src/library/BloomFilterBundle.php
+++ b/src/library/BloomFilterBundle.php
@@ -151,7 +151,7 @@ class BloomFilterBundle
             }
             for ($j = 0; $j < $count; $j++) {
                 if ($field_names === null) {
-                    $tmp = $arr[$j];
+                    $tmp = & $arr[$j];
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                         We deliberately don't try to add anything that has
@@ -164,7 +164,7 @@ class BloomFilterBundle
                     }
                 } else { //now do the same strategy for the array of fields case
                     foreach ($field_names as $field_name) {
-                        $tmp = $arr[$j][$field_name];
+                        $tmp = & $arr[$j][$field_name];
                         if ($tmp !== false && $tmp_filter->contains($tmp)) {
                             unset($arr[$j]);
                             break;
ViewGit