Restrict mirroring to serial queries so that it works correctly, a=chris

Chris Pollett [2019-06-13 16:Jun:th]
Restrict mirroring to serial queries so that it works correctly, a=chris
Filename
src/configs/Config.php
src/controllers/SearchController.php
src/controllers/components/SystemComponent.php
src/executables/Mirror.php
src/models/Model.php
src/models/PhraseModel.php
src/views/elements/ManagemachinesElement.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 25a6ca070..233bf16b9 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -261,10 +261,10 @@ ini_set('pcre.backtrack_limit', 1000000);
 nsconddefine("BASE_DIR", str_replace("\\", "/", realpath(__DIR__ ."/../")));
 nsconddefine("PARENT_DIR",  substr(BASE_DIR, 0, -strlen("/src")));
 nsconddefine("TEST_DIR",  PARENT_DIR . '/tests');
-if (file_exists(BASE_DIR."/configs/LocalConfig.php")) {
+if (file_exists(BASE_DIR . "/configs/LocalConfig.php")) {
     /** Include any locally specified defines (could use as an alternative
         way to set work directory) */
-    require_once(BASE_DIR."/configs/LocalConfig.php");
+    require_once(BASE_DIR . "/configs/LocalConfig.php");
 }
 initializeBaseUrlAndCurrentWorkingDirectory();
 /** Yioop Namespace*/
@@ -496,15 +496,15 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
     /** BM25F weight for other text within links to a doc*/
     nsdefine('LINK_WEIGHT', 2);
     /**
-        If that many exist, the minimum number of results to get
-        and group before trying to compute the top x (say 10) results
+     * If that many exist, the minimum number of results to get
+     * and group before trying to compute the top x (say 10) results
      */
     nsdefine('MIN_RESULTS_TO_GROUP', 200);
     /**
-        For a given number of search results total to return (total_num)
-        server_alpha*total_num/num_servers will be returned any a given
-        queue server machine
-    */
+     * For a given number of search results total to return (total_num)
+     * server_alpha*total_num/num_servers will be returned by any given
+     * queue server machine
+     */
     nsdefine('SERVER_ALPHA', 1.6);
     nsdefine('BACKGROUND_COLOR', "#FFFFFF");
     nsdefine('FOREGROUND_COLOR', "#FFFFFF");
@@ -871,7 +871,7 @@ nsconddefine('QUEUE_SLEEP_TIME', 5);
 nsconddefine('MIRROR_SYNC_FREQUENCY', ONE_HOUR);
 /** How often mirror script tries to notify machine it is mirroring that it
 is still alive*/
-nsconddefine('MIRROR_NOTIFY_FREQUENCY', 5 * ONE_MINUTE);
+nsconddefine('MIRROR_NOTIFY_FREQUENCY', ONE_MINUTE);
 /** Max time before dirty index (queue_server) and
     filters (fetcher) will be force saved in seconds*/
 nsconddefine('FORCE_SAVE_TIME', ONE_HOUR);
@@ -884,8 +884,8 @@ nsconddefine('MAX_QUERY_LEN', 4096);
 /** whether to use question answering system */
 nsconddefine('ENABLE_QUESTION_ANSWERING', true);
 /** If true, when processing query see if subsets of terms in query form a
-    known phrase and if so do lookup with that rather than do a conjunctive
-    query over those terms
+ *  known phrase and if so do lookup with that rather than do a conjunctive
+ *  query over those terms
  */
 nsconddefine("SUFFIX_PHRASES", false);
 /** Number of words until to switch from bag of words to phrase lookup
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 69616e7f0..3798f8562 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -163,6 +163,11 @@ class SearchController extends Controller implements CrawlConstants
             $data['TOTAL_TIME'] = L\changeInMicrotime(
                 $_SERVER["REQUEST_TIME_FLOAT"]);
             if ($view == "serial") {
+                if (isset($_REQUEST['mirror']) &&
+                    $_REQUEST['mirror'] == "true") {
+                    // note we are acting as a mirror -- not used yet
+                    $data['MIRROR'] = true;
+                }
                 $data = serialize($data);
                 if (empty(ini_get('zlib.output_compression')) &&
                     !$this->web_site->isCli()) {
@@ -677,6 +682,7 @@ class SearchController extends Controller implements CrawlConstants
         }
     }
     /**
+     * Only used for serial network queries
      * Used to check if there are any mirrors of the current server.
      * If so, it tries to distribute the query requests randomly amongst
      * the mirrors and itself. To determine if there are mirrors of the
@@ -687,6 +693,9 @@ class SearchController extends Controller implements CrawlConstants
      */
     public function mirrorHandle()
     {
+        if (empty($_REQUEST['f']) || $_REQUEST['f'] != 'serial') {
+            return false;
+        }
         $mirror_table_name = C\CRAWL_DIR . "/" . self::mirror_table_name;
         $handled = false;
         if (file_exists($mirror_table_name)) {
@@ -699,7 +708,11 @@ class SearchController extends Controller implements CrawlConstants
                     if ($entry[0] == "::1") {
                         $entry[0] = "[::1]";
                     }
-                    $request = "http://" . $entry[0] . $entry[1];
+                    /* assume mirror uses same scheme as machine mirroring
+                     * i.e., http or https
+                     */
+                    $request = UrlParser::getScheme(C\BASE_URL) . '://'.
+                        $entry[0] . $entry[1];
                     $mirrors[] = $request;
                 }
             }
@@ -710,8 +723,27 @@ class SearchController extends Controller implements CrawlConstants
                 // if ==$count, we'll let the current machine handle it
                 if ($rand < $count) {
                     $request = $mirrors[$rand] . "?" .
-                        $_SERVER["QUERY_STRING"] . "&network=false";
-                    echo FetchUrl::getPage($request);
+                        $_SERVER["QUERY_STRING"] . "&mirror=true";
+                    if (strpos($_SERVER["QUERY_STRING"], "network=") === false){
+                        $request .= "&network=false";
+                    }
+                    /* fetch the mirror's response once so that the length
+                     * sent in Content-Length is that of the bytes output
+                     */
+                    $page = FetchUrl::getPage($request);
+                    if (empty(ini_get('zlib.output_compression')) &&
+                        !$this->web_site->isCli()) {
+                        ob_start("ob_gzhandler");
+                        $this->web_site->header("Content-Type: text/plain");
+                        e($page);
+                        ob_end_flush();
+                    } else {
+                        $this->web_site->header("Content-Type: text/plain");
+                        $this->web_site->header("Content-Length: " .
+                            strlen($page));
+                        e($page);
+                        flush();
+                    }
                     $handled = true;
                 }
             }
diff --git a/src/controllers/components/SystemComponent.php b/src/controllers/components/SystemComponent.php
index 3e7f09b16..c91085dcb 100755
--- a/src/controllers/components/SystemComponent.php
+++ b/src/controllers/components/SystemComponent.php
@@ -88,6 +88,7 @@ class SystemComponent extends Component
         $machine_names = $machine_model->getQueueServerNames();
         $data['PARENT_MACHINES'] = array_combine($machine_names,
             $machine_names);
+        $data['PARENT'] = $machine_names[0];
         $tmp = tl('system_component_select_machine');
         if (isset($_REQUEST['channel']) && $_REQUEST['channel'] == -1) {
             $_REQUEST['num_fetchers'] = 0;
diff --git a/src/executables/Mirror.php b/src/executables/Mirror.php
index f8d5ad582..167f1bb3e 100644
--- a/src/executables/Mirror.php
+++ b/src/executables/Mirror.php
@@ -223,7 +223,7 @@ class Mirror implements CrawlConstants
         $session = md5($time . C\AUTH_KEY);
         $write_sync_time = true;
         $request =
-            $server.
+            $server .
             "?c=resource&time=$time&session=$session" .
             "&robot_instance=" . C\ROBOT_INSTANCE . "&machine_uri=" .
             C\WEB_URI . "&last_sync=" . $this->last_sync;
diff --git a/src/models/Model.php b/src/models/Model.php
index 24226bc24..a2f00bfaf 100755
--- a/src/models/Model.php
+++ b/src/models/Model.php
@@ -439,6 +439,15 @@ class Model implements CrawlConstants
                 return false;
             }
         }
+        if (is_array($machine_urls) && count($machine_urls) == 1 &&
+            C\NAME_SERVER == $machine_urls[0]) {
+            $mirror_table_name = C\CRAWL_DIR . "/" . self::mirror_table_name;
+            if (file_exists($mirror_table_name) &&
+                time() - filemtime($mirror_table_name) <
+                2 * C\MIRROR_NOTIFY_FREQUENCY) {
+                return false;
+            }
+        }
         return true;
     }
     /**
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 3d65200b5..ec3db4911 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1415,6 +1415,12 @@ class PhraseModel extends ParallelModel
             $lookup_queue_servers[] = C\NAME_SERVER;
                 //name server might still have news
         }
+        if (count($lookup_queue_servers) == 1 &&
+            $lookup_queue_servers[0] == C\NAME_SERVER &&
+            C\BASE_URL == C\NAME_SERVER) {
+            // for now only do mirror non-lookup offsets
+            $lookup_queue_servers = [];
+        }
         /* look up items (items we have a link summary for, but not doc
             summary)*/
         $summaries = $this->getCrawlItems($lookups, $lookup_queue_servers,
@@ -1585,7 +1591,7 @@ class PhraseModel extends ParallelModel
                 substr(L\crawlHash("site:doc"), 0, 9)];
             if ($save_timestamp_name != "") {
                 // used for archive crawls of crawl mixes
-                $save_file = C\CRAWL_DIR.'/schedules/' . self::save_point .
+                $save_file = C\CRAWL_DIR . '/schedules/' . self::save_point .
                     $save_timestamp_name . ".txt";
                 if (file_exists($save_file)) {
                     $save_point =
diff --git a/src/views/elements/ManagemachinesElement.php b/src/views/elements/ManagemachinesElement.php
index 2715927d0..0ece30e6a 100644
--- a/src/views/elements/ManagemachinesElement.php
+++ b/src/views/elements/ManagemachinesElement.php
@@ -83,7 +83,7 @@ class ManagemachinesElement extends Element
             tl('managemachines_element_parent_name')?></label></th>
             <td><?php $this->view->helper("options")->render(
                 "parent-machine-name", "parent",
-                $data['PARENT_MACHINES'], 0); ?></td></tr>
+                $data['PARENT_MACHINES'], $data['PARENT']); ?></td></tr>
         <tr id="m2"><th><label for="fetcher-number"><?=
             tl('managemachines_element_num_fetchers')?></label></th><td>
             <?php $this->view->helper("options")->render("fetcher-number",
ViewGit