Chris Pollett [2019-07-10]
Tweaks to how the server name is computed; tweaks to processIndexArchive so the blocked case avoids recomputing the file it attempted to process; also slightly better error handling, a=chris
Filename
src/configs/Config.php
src/controllers/FetchController.php
src/executables/QueueServer.php
src/locale/en_US/configure.ini
src/models/SearchfiltersModel.php
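
Before the per-file diffs, note the structural change in Config.php: the unconditional call to initializeBaseUrlAndCurrentWorkingDirectory() right after LocalConfig.php is loaded is removed, and the call now happens after the work-directory profile is required, in both the profile and no-profile branches. This is presumably so that constants the profile or local config may define, such as SERVER_CONTEXT and NAME_SERVER, are already visible when the base URL is computed. A minimal sketch of the resulting control flow, with everything but the relevant calls elided:

    if (file_exists(BASE_DIR . "/configs/LocalConfig.php")) {
        require_once(BASE_DIR . "/configs/LocalConfig.php");
    }
    // ... namespace and maintenance-mode setup ...
    if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
        require_once WORK_DIRECTORY . PROFILE_FILE_NAME;
        initializeBaseUrlAndCurrentWorkingDirectory(); // now sees profile values
        // ...
    } else {
        initializeBaseUrlAndCurrentWorkingDirectory();
        // ...
    }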
diff --git a/src/configs/Config.php b/src/configs/Config.php
index aa2be2f1c..4fe1275b3 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -172,7 +172,7 @@ nsdefine('MIN_AD_VERSION', 36);
 nsdefine('RESOURCES_WIKI_VERSION', 5);
 /**
  * nsdefine's the BASE_URL constant for this script
- * if run from the command line as part of index.php HTTP server scrip
+ * if run from the command line as part of index.php HTTP server script
  * set the current working directory as well
  */
 function initializeBaseUrlAndCurrentWorkingDirectory()
@@ -224,14 +224,17 @@ function initializeBaseUrlAndCurrentWorkingDirectory()
     $port = ( ($http == "http://" && ($server_port != 80) ||
         ($http == "https://" && $server_port != 443))) ?
         ":" . $server_port : "";
-    if (nsdefined('SERVER_CONTEXT')) {;
-            $context = SERVER_CONTEXT;
-            if (!empty($context['SERVER_NAME'])) {
-                $_SERVER['SERVER_NAME'] = $context['SERVER_NAME'];
-            }
+    if (nsdefined('SERVER_CONTEXT')) {
+        $context = SERVER_CONTEXT;
+        if (!empty($context['SERVER_NAME'])) {
+            $_SERVER['SERVER_NAME'] = $context['SERVER_NAME'];
         }
+    }
     $server_name = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] :
         "localhost";
+    if (nsdefined('NAME_SERVER') && NAME_SERVER == "www." . $server_name) {
+        $server_name = NAME_SERVER;
+    }
     if (strpos($server_name, ":") !== false && $server_name[0] != '[') {
         $server_name = "[$server_name]"; //guessing ipv6 address
     }
@@ -266,7 +269,6 @@ if (file_exists(BASE_DIR . "/configs/LocalConfig.php")) {
         way to set work directory) */
     require_once(BASE_DIR . "/configs/LocalConfig.php");
 }
-initializeBaseUrlAndCurrentWorkingDirectory();
 /** Yioop Namespace*/
 nsdefine('NS', "seekquarry\\yioop\\");
 /** controllers sub-namespace */
@@ -346,7 +348,9 @@ nsconddefine('MAINTENANCE_MESSAGE', <<<EOD
 This Yioop! installation is undergoing maintenance, please come back later!
 EOD
 );
-if (MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) {
+if (MAINTENANCE_MODE && !empty($_SERVER["SERVER_ADDR"]) &&
+    !empty($_SERVER["REMOTE_ADDR"]) &&
+    $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) {
     echo MAINTENANCE_MESSAGE;
     exit();
 }
@@ -408,6 +412,7 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
         file_put_contents(WORK_DIRECTORY . PROFILE_FILE_NAME, $new_profile);
     }
     require_once WORK_DIRECTORY . PROFILE_FILE_NAME;
+    initializeBaseUrlAndCurrentWorkingDirectory();
     nsdefine('PROFILE', true);
     nsdefine('CRAWL_DIR', WORK_DIRECTORY);
     if (is_dir(APP_DIR."/locale")) {
@@ -430,6 +435,7 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
         nsdefine("FIX_NAME_SERVER", true);
     }
 } else {
+    initializeBaseUrlAndCurrentWorkingDirectory();
     if ((!isset( $_SERVER['SERVER_NAME']) ||
         $_SERVER['SERVER_NAME']!=='localhost')
         && !nsdefined("NO_LOCAL_CHECK") && !nsdefined("WORK_DIRECTORY")
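
Net effect of the new server-name logic in initializeBaseUrlAndCurrentWorkingDirectory(): an explicit SERVER_CONTEXT['SERVER_NAME'] wins, then $_SERVER['SERVER_NAME'] with "localhost" as the fallback, and, newly, if the configured NAME_SERVER is exactly the www-prefixed form of the detected name, the www form is used so the computed BASE_URL agrees with the name server. A free-standing sketch of that decision order (nsdefined() is Yioop's namespaced defined() helper; the real code sets $server_name inline rather than in a helper function like this):

    function computeServerName()
    {
        if (nsdefined('SERVER_CONTEXT')) {
            $context = SERVER_CONTEXT;
            if (!empty($context['SERVER_NAME'])) {
                $_SERVER['SERVER_NAME'] = $context['SERVER_NAME'];
            }
        }
        $server_name = isset($_SERVER['SERVER_NAME']) ?
            $_SERVER['SERVER_NAME'] : "localhost";
        // New in this commit: prefer the configured name server when it
        // is just the www-prefixed form of the detected name.
        if (nsdefined('NAME_SERVER') && NAME_SERVER == "www." . $server_name) {
            $server_name = NAME_SERVER;
        }
        if (strpos($server_name, ":") !== false && $server_name[0] != '[') {
            $server_name = "[$server_name]"; // guessing an IPv6 address
        }
        return $server_name;
    }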
diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php
index 6c1d2d0a8..beb10ac37 100755
--- a/src/controllers/FetchController.php
+++ b/src/controllers/FetchController.php
@@ -299,7 +299,7 @@ class FetchController extends Controller implements CrawlConstants
             if (file_exists($this->crawl_status_file_name)) {
                 $crawl_status = unserialize(file_get_contents(
                     $this->crawl_status_file_name));
-                if ($crawl_status['CRAWL_TIME'] != 0) {
+                if (!empty($crawl_status['CRAWL_TIME'])) {
                     $restart = false;
                 }
             }
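
The FetchController change is small but meaningful: $crawl_status['CRAWL_TIME'] != 0 raises an "Undefined index" notice whenever a stale or truncated status file lacks the key, whereas !empty() quietly treats a missing key, null, 0, and "0" all as empty. For instance:

    $crawl_status = [];  // e.g. status file unserialized to something partial
    // Old test: PHP notice "Undefined index: CRAWL_TIME", then a loose
    // null != 0 comparison (false anyway).
    // if ($crawl_status['CRAWL_TIME'] != 0) { $restart = false; }
    // New test: no notice; true only for a present, nonzero crawl time.
    if (!empty($crawl_status['CRAWL_TIME'])) {
        $restart = false;
    }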
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 3daea00e0..30de04c36 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1773,7 +1773,7 @@ class QueueServer implements CrawlConstants, Join
         static $blocked = false;
         if ($blocking && $blocked) {
             L\crawlLog("Indexer waiting for merge tiers to ".
-                "complete before write partition. B");
+                "complete before write partition.");
             return;
         }
         if (!$blocking) {
@@ -1785,10 +1785,10 @@ class QueueServer implements CrawlConstants, Join
         L\crawlLog("Indexer: Processing index data in $file...");
         $start_time = microtime(true);
         $start_total_time = microtime(true);
-        $pre_sites = L\webdecode(file_get_contents($file));
-        $len_urls = L\unpackInt(substr($pre_sites, 0, 4));
-        $seen_urls_string = substr($pre_sites, 4, $len_urls);
-        $pre_sites = substr($pre_sites, 4 + $len_urls);
+        $pre_sites_and_index = L\webdecode(file_get_contents($file));
+        $len_urls = L\unpackInt(substr($pre_sites_and_index, 0, 4));
+        $seen_urls_string = substr($pre_sites_and_index, 4, $len_urls);
+        $pre_sites_and_index = substr($pre_sites_and_index, 4 + $len_urls);
         $sites[self::SEEN_URLS] = [];
         $pos = 0;
         $num = 0;
@@ -1828,12 +1828,30 @@ class QueueServer implements CrawlConstants, Join
             return;
         }
         L\crawlLog("A. Indexer Load SEEN_URLS. Memory usage:".
-            memory_get_usage() ." time: ".L\changeInMicrotime($start_time));
+            memory_get_usage() ." time: " . L\changeInMicrotime($start_time));
         $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard",
-            $pre_sites);
-        unset($pre_sites);
+            $pre_sites_and_index);
+        if (empty($sites[self::INVERTED_INDEX])) {
+            L\crawlLog("Index data file inverted index empty or corrupt.");
+            L\crawlLog("Indexer Done Index Processing File: $file. " .
+                "Total time: " . L\changeInMicrotime($start_total_time));
+            unlink($file);
+            return;
+        }
+        $index_shard = $sites[self::INVERTED_INDEX];
+        $generation = $this->index_archive->initGenerationToAdd(
+                $index_shard->num_docs, $this, $blocking);
+        if ($generation == -1) {
+            L\crawlLog("Indexer waiting for merge tiers to ".
+                "complete before write partition. A");
+            $blocked = true;
+            // In this case if we block, will end up reprocess file
+            return; /* if don't return here can process rest of
+                method */
+        }
+        unset($pre_sites_and_index);
         L\crawlLog("B. Indexer Load Sent shard. Memory usage:".
-            memory_get_usage() ." time: ".(L\changeInMicrotime($start_time)));
+            memory_get_usage() ." time: " . (L\changeInMicrotime($start_time)));
         $start_time = microtime(true);
         //do deduplication of summaries
         if (isset($sites[self::SEEN_URLS]) &&
@@ -1860,7 +1878,7 @@ class QueueServer implements CrawlConstants, Join
                     L\crawlHash($link_url_parts[1], true)
                     . L\crawlHash($seen_sites[$i][self::URL], true)
                     . $reftype . substr(L\crawlHash(
-                      UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
+                    UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
                 $seen_sites[$i][self::IS_DOC] = false;
             } else {
                 $seen_sites[$i][self::IS_DOC] = true;
@@ -1872,40 +1890,29 @@ class QueueServer implements CrawlConstants, Join
                 $recent_urls_count++;
             }
         }
-        if (isset($sites[self::INVERTED_INDEX])) {
-            $index_shard = $sites[self::INVERTED_INDEX];
-            $generation = $this->index_archive->initGenerationToAdd(
-                    $index_shard->num_docs, $this, $blocking);
-            if ($generation == -1) {
-                L\crawlLog("Indexer waiting for merge tiers to ".
-                    "complete before write partition. A");
-                $blocked = true;
-                return;
-            }
-            $summary_offsets = [];
-            if (isset($seen_sites)) {
-                $this->index_archive->addPages(
-                    $generation, self::SUMMARY_OFFSET, $seen_sites,
-                    $visited_urls_count);
-                foreach ($seen_sites as $site) {
-                    if ($site[self::IS_DOC]) { // so not link
-                        $site_url = str_replace('|', "%7C", $site[self::URL]);
-                        $host = UrlParser::getHost($site_url);
-                        $hash = L\crawlHash($site_url, true).
-                            $site[self::HASH] .
-                            "d". substr(L\crawlHash($host."/", true), 1);
-                    } else {
-                        $hash = $site[self::HASH_URL];
-                    }
-                    $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
+        $summary_offsets = [];
+        if (!empty($seen_sites)) {
+            $this->index_archive->addPages($generation, self::SUMMARY_OFFSET,
+                $seen_sites, $visited_urls_count);
+            foreach ($seen_sites as $site) {
+                if ($site[self::IS_DOC]) { // so not link
+                    $site_url = str_replace('|', "%7C", $site[self::URL]);
+                    $host = UrlParser::getHost($site_url);
+                    $hash = L\crawlHash($site_url, true) . $site[self::HASH] .
+                        "d". substr(L\crawlHash($host . "/", true), 1);
+                } else {
+                    $hash = $site[self::HASH_URL];
                 }
-                unset($seen_sites);
+                $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
-            L\crawlLog("C. Indexer init local shard, store ".
-                "Summaries memory usage: ". memory_get_usage() .
-                " time: " . L\changeInMicrotime($start_time));
-            $start_time = microtime(true);
-            // added summary offset info to inverted index data
+            unset($seen_sites);
+        }
+        L\crawlLog("C. Indexer init local shard, store " .
+            "Summaries memory usage: ". memory_get_usage() .
+            " time: " . L\changeInMicrotime($start_time));
+        $start_time = microtime(true);
+        // added summary offset info to inverted index data
+        if (!empty($summary_offsets)) {
             $index_shard->changeDocumentOffsets($summary_offsets);
             L\crawlLog("D. Indexer Update shard offsets. Memory usage: ".
                 memory_get_usage() . " time: " .
@@ -1913,16 +1920,16 @@ class QueueServer implements CrawlConstants, Join
             $start_time = microtime(true);
             $this->index_archive->addIndexData($index_shard);
             $this->index_dirty = true;
+            L\crawlLog("E. Indexer Add index shard. Memory usage: ".
+                memory_get_usage() . " time: " .
+                L\changeInMicrotime($start_time));
         }
-        L\crawlLog("E. Indexer Add index shard. Memory usage: ".
-            memory_get_usage() . " time: " .
-            L\changeInMicrotime($start_time));
-        L\crawlLog("Indexer Done Index Processing File: $file. Total time: ".
-            L\changeInMicrotime($start_total_time));
         if (isset($recent_urls)) {
             $sites[self::RECENT_URLS] = $recent_urls;
             $this->writeCrawlStatus($sites);
         }
+        L\crawlLog("Indexer Done Index Processing File: $file. Total time: ".
+            L\changeInMicrotime($start_total_time));
         if (file_exists($file)) {
             //Haven't tracked down yet, but can try to delete twice giving warn
             unlink($file);
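
The QueueServer reordering is the "avoid recomputation" half of the commit message. Previously, initGenerationToAdd() was only called after the summaries had been deduplicated and link hashes computed, so a -1 return (blocked on merging tiers) threw all that work away, to be redone when the file was retried. Now the shard is validated and the generation reserved immediately after IndexShard::load(), and an empty or corrupt shard is detected explicitly, logged, and its file discarded. Schematically (a sketch of the new ordering, not the full method):

    $index_shard = IndexShard::load("fetcher_shard", $pre_sites_and_index);
    if (empty($index_shard)) {   // new: bail out on an empty/corrupt shard
        unlink($file);
        return;
    }
    $generation = $this->index_archive->initGenerationToAdd(
        $index_shard->num_docs, $this, $blocking);
    if ($generation == -1) {     // blocked: return before the heavy work
        $blocked = true;
        return;
    }
    // ... only now: deduplicate summaries, addPages(), build
    // $summary_offsets, changeDocumentOffsets(), addIndexData() ...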
diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini
index 8939419f3..8b6be01b0 100644
--- a/src/locale/en_US/configure.ini
+++ b/src/locale/en_US/configure.ini
@@ -1444,7 +1444,7 @@ trending_element_term = "Term"
 trending_element_score = "Score"
 trending_element_date = "Computed %s"
 trending_element_hourly_trend = "Hourly Trend Score for &#039;%s&#039; for Last Day"
-trending_element_daily_trend = "Dailly Trend Score for &#039;%s&#039; for Last Week"
+trending_element_daily_trend = "Daily Trend Score for &#039;%s&#039; for Last Week"
 ;
 ; ManageusersElement.php
 manageusers_element_users = "User List"
diff --git a/src/models/SearchfiltersModel.php b/src/models/SearchfiltersModel.php
index 04fbec6da..bcd9a893f 100644
--- a/src/models/SearchfiltersModel.php
+++ b/src/models/SearchfiltersModel.php
@@ -60,8 +60,7 @@ class SearchfiltersModel extends Model
         $this->dir_name = C\CRAWL_DIR . "/search_filters";
         if (!file_exists(C\CRAWL_DIR . "/search_filters")) {
             mkdir($this->dir_name);
-            $this->db->setWorldPermissionsRecursive(
-                $this->dir_name, true);
+            $this->db->setWorldPermissionsRecursive($this->dir_name, true);
         }
     }
     /**