Tweaks to how caching is handled by IndexManager to try to get feeds to work better when acting as its own web server, a=chris

Chris Pollett [2018-04-19 20h]
Tweaks to how caching is handled by IndexManager to try to get feeds to work better when acting as its own web server, a=chris
Filename
src/configs/Config.php
src/controllers/SearchController.php
src/controllers/components/SystemComponent.php
src/library/FileCache.php
src/library/IndexArchiveBundle.php
src/library/IndexManager.php
src/library/WebSite.php
src/models/PhraseModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 92298f9e6..afed87782 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -437,6 +437,7 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
     nsdefine('API_ACCESS', true);
     nsdefine('REGISTRATION_TYPE', 'disable_registration');
     nsdefine('USE_MAIL_PHP', true);
+    nsdefine('MAIL_SENDER', '');
     nsdefine('MAIL_SERVER', '');
     nsdefine('MAIL_PORT', '');
     nsdefine('MAIL_USERNAME', '');
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index c0550cdef..e64726e22 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -87,7 +87,7 @@ class SearchController extends Controller implements CrawlConstants
             } else if (C\USE_FILECACHE) {
                 $phrase_model::$cache =
                     new FileCache(C\WORK_DIRECTORY . "/cache/queries",
-                    $web_site);
+                    $this->web_site);
                 $_SERVER["USE_CACHE"] = true;
             } else {
                 $_SERVER["USE_CACHE"] = false;
diff --git a/src/controllers/components/SystemComponent.php b/src/controllers/components/SystemComponent.php
index 04f4c3862..c098b84d7 100755
--- a/src/controllers/components/SystemComponent.php
+++ b/src/controllers/components/SystemComponent.php
@@ -553,9 +553,11 @@ class SystemComponent extends Component
         $data["ELEMENT"] = "serversettings";
         switch ($arg) {
             case "clearCache":
-                $cache = new L\FileCache(C\WORK_DIRECTORY . "/cache/queries");
-                $cache->clear();
-                $parent->web_site->clearFileCache();
+                L\IndexManager::clearCache();
+                $phrase_model = $parent->model("phrase");
+                if (!empty($phrase_model::$cache)) {
+                    $phrase_model::$cache->clear();
+                }
                 return $parent->redirectWithMessage(
                     tl('system_component_cache_cleared'));
             break;
diff --git a/src/library/FileCache.php b/src/library/FileCache.php
index ac5e1bf80..a76428501 100644
--- a/src/library/FileCache.php
+++ b/src/library/FileCache.php
@@ -75,7 +75,7 @@ class FileCache
         if (!is_dir($this->dir_name)) {
             mkdir($this->dir_name);
             chmod($this->dir_name, 0777);
-            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
+            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
             $db = new $db_class();
             $db->setWorldPermissionsRecursive($this->dir_name, true);
         }
@@ -139,11 +139,6 @@ class FileCache
     /**
      * Stores in the cache a key-value pair
      *
-     * Only when a key is set is there a check for whether to invalidate
-     * a cache bin. It is deleted as invalid if the following two conditions
-     * both hold:
-     * The last time it was expired is more than SECONDS_IN_A_BIN seconds ago,
-     * and the number of cache items is more than self::MAX_FILES_IN_A_BIN.
      *
      * @param string $key to associate with value
      * @param mixed $value to store
@@ -157,7 +152,7 @@ class FileCache
             chmod($checksum_dir, 0777);
         }
         $cache_file = "$checksum_dir/c".webencode($key);
-        $this->updateCache($key, true);
+        $this->updateCache($key);
         $this->filePutContents($cache_file, serialize($value));
     }
     /**
@@ -181,6 +176,9 @@ class FileCache
      */
     public function clear()
     {
+        if (!empty($this->web_site)) {
+            $this->web_site->clearFileCache();
+        }
         if (is_dir($this->dir_name)) {
             $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager";
             $db = new $db_class();
@@ -190,7 +188,7 @@ class FileCache
     /**
      *
      */
-    protected function updateCache($key, $is_write = false)
+    protected function updateCache($key)
     {
         $checksum_block = $this->checksum($key);
         $checksum_dir = $this->dir_name . "/$checksum_block";
@@ -241,7 +239,7 @@ class FileCache
                     unlink($delete_file);
                 }
             }
-            if ($now - $data["TIME"] > C\ONE_HOUR) {
+            if ($now - $data["TIME"] > C\MIN_QUERY_CACHE_TIME) {
                 $in_cache_files = array_flip(glob($checksum_dir . "/c*"));
                 $keys = array_keys($data['UNMARKED']);
                 foreach ($keys as $check_key) {
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index aef58de05..6f8f36c81 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -140,21 +140,21 @@ class IndexArchiveBundle implements CrawlConstants
         $is_dir = is_dir($this->dir_name);
         if (!$is_dir && !$read_only_archive) {
             mkdir($this->dir_name);
-            mkdir($this->dir_name."/posting_doc_shards");
+            mkdir($this->dir_name . "/posting_doc_shards");
         } else if (!$is_dir) {
             return false;
         } else {
             $index_archive_exists = true;
         }
-        if (file_exists($this->dir_name."/generation.txt")) {
+        if (file_exists($this->dir_name . "/generation.txt")) {
             $this->generation_info = unserialize(
-                file_get_contents($this->dir_name."/generation.txt"));
+                file_get_contents($this->dir_name . "/generation.txt"));
         } else if (!$read_only_archive) {
             $this->generation_info['ACTIVE'] = 0;
-            file_put_contents($this->dir_name."/generation.txt",
+            file_put_contents($this->dir_name . "/generation.txt",
                 serialize($this->generation_info));
         }
-        $this->summaries = new WebArchiveBundle($dir_name."/summaries",
+        $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
             $read_only_archive, -1, $description);
         if (!$read_only_archive) {
             $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
@@ -164,7 +164,7 @@ class IndexArchiveBundle implements CrawlConstants
             $this->version = $this->summaries->version;
         }
         $this->num_docs_per_generation = $num_docs_per_generation;
-        $this->dictionary = new IndexDictionary($this->dir_name."/dictionary",
+        $this->dictionary = new IndexDictionary($this->dir_name . "/dictionary",
             $this);
     }
     /**
@@ -470,4 +470,4 @@ class IndexArchiveBundle implements CrawlConstants
     {
         return WebArchiveBundle::getParamModifiedTime($dir_name."/summaries");
     }
-}
\ No newline at end of file
+}
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 76f40fd70..ee7d6b9a3 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C;
 /**
  * For crawlHash
  */
-require_once __DIR__."/Utility.php";
+require_once __DIR__ . "/Utility.php";
 /**
  * Class used to manage open IndexArchiveBundle's while performing
  * a query. Ensures an easy place to obtain references to these bundles
@@ -52,11 +52,13 @@ class IndexManager implements CrawlConstants
      */
     public static $indexes = [];
     /**
-     * Used to cache word lookup of posting list locations for a given
-     * index
-     * @var array
+     * Time at which each entry of $indexes was loaded, keyed by index name
+     */
+    public static $index_times = [];
+    /**
+     * Number of entries $indexes may hold before the oldest are evicted
      */
-    public static $dictionary = [];
+    const INDEX_CACHE_SIZE = 1000;
     /**
      * Returns a reference to the managed copy of an IndexArchiveBundle object
      * with a given timestamp or an IndexShard in the case where
@@ -68,12 +70,15 @@ class IndexManager implements CrawlConstants
     public static function getIndex($index_name)
     {
         $index_name = trim($index_name); //trim to fix postgres quirkiness
-        if (!isset(self::$indexes[$index_name])) {
+        if (empty(self::$indexes[$index_name]) ||
+            ($index_name == "feed" && !empty(self::$index_times["feed"]) &&
+            (time() - self::$index_times["feed"]) > C\MIN_QUERY_CACHE_TIME) ) {
             if ($index_name == "feed") {
                 $index_file = C\WORK_DIRECTORY."/feeds/index";
                 if (file_exists($index_file)) {
                     self::$indexes[$index_name] = new IndexShard(
                         $index_file, 0, C\NUM_DOCS_PER_GENERATION, true);
+                    self::$index_times["feed"]  = time();
                 } else {
                     return false;
                 }
@@ -86,10 +91,30 @@ class IndexManager implements CrawlConstants
                 }
                 self::$indexes[$index_name] = $tmp;
                 self::$indexes[$index_name]->setCurrentShard(0, true);
+                self::$index_times[$index_name] = time();
+            }
+            if (count(self::$indexes) > self::INDEX_CACHE_SIZE) {
+                $times = array_values(self::$index_times);
+                sort($times);
+                $oldest_third = $times[floor(count($times)/3)];
+                foreach (self::$index_times as $name => $time) {
+                    if ($time <= $oldest_third) {
+                        unset(self::$index_times[$name], self::$indexes[$name]);
+                    }
+                }
             }
         }
         return self::$indexes[$index_name];
     }
+    /**
+     * Clears the static variables that cache the indexes which have been
+     * read in and the times at which they were loaded.
+     */
+    public static function clearCache()
+    {
+        self::$indexes = [];
+        self::$index_times = [];
+    }
     /**
      * Returns the version of the index, so that Yioop can determine
      * how to do word lookup.The only major change to the format was
@@ -112,7 +137,7 @@ class IndexManager implements CrawlConstants
         return 1;
     }
     /**
-     * Gets an array posting list positions for each shard in the
+     * Gets an array of posting list positions for each shard in the
      * bundle $index_name for the word id $hash
      *
      * @param string $index_name bundle to look $hash in
@@ -136,32 +161,7 @@ class IndexManager implements CrawlConstants
         $num_distinct_generations = -1, $with_remaining_total = false)
     {
         $id = "$index_name:$start_generation:$num_distinct_generations";
-        $index = IndexManager::getIndex($index_name);
-        if (!$index->dictionary) {
-            $tmp = [];
-            if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS)
-               && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
-               //NO_FEEDS defined true in statistic_controller.php
-                $use_feeds = true;
-                $feed_shard = IndexManager::getIndex("feed");
-                $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
-                    $mask);
-                if (is_array($feed_info)) {
-                    $tmp[-1] = [-1, $feed_info[0],
-                        $feed_info[1], $feed_info[2], $feed_info[3]];
-                }
-            }
-            if ($tmp == []) {
-                return ($with_remaining_total) ? [0, false] : false;
-            }
-            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
-                [$feed_info[3], $tmp];
-            return ($with_remaining_total) ?
-                IndexManager::$dictionary[$id][$hash][$shift][$mask][
-                    $threshold] :
-                IndexManager::$dictionary[$id][$hash][$shift][$mask][
-                    $threshold][1];
-        }
+        $index = self::getIndex($index_name);
         $len = strlen($mask);
         if ($len > 0) {
             $pre_hash = substr($hash, 0, 8) .
@@ -169,95 +169,38 @@ class IndexManager implements CrawlConstants
         } else {
             $pre_hash = $hash;
         }
-        if (!isset(IndexManager::$dictionary[$id][$hash][$shift][$mask][
-            $threshold])) {
-            $tmp = [];
-            $test_mask = "";
-            if (isset(IndexManager::$dictionary[$id][$pre_hash][
-                $shift])) {
-                foreach (IndexManager::$dictionary[$id][$pre_hash][
-                    $shift] as $test_mask => $data) {
-                    $mask_len = strlen($test_mask);
-                    if ($mask_len > $len) {continue; }
-                    $mask_found = true;
-                    for ($k = 0; $k < $mask_len; $k++) {
-                        if (ord($test_mask[$k]) > 0 &&
-                            $test_mask[$k] != $mask[$k]) {
-                            $mask_found = false;
-                            break;
-                        }
-                    }
-                    if ($mask_found && isset(
-                        IndexManager::$dictionary[$id][$pre_hash][
-                            $shift][$test_mask][$threshold]) ) {
-                        list($total, $info) =
-                            IndexManager::$dictionary[$id][$pre_hash
-                            ][$shift][$test_mask][$threshold];
-                        $out_info = [];
-                        foreach ($info as $record) {
-                            $rid = $record[4];
-                            $add_flag = true;
-                            if ($mask != "") {
-                               for ($k = 0; $k < $len; $k++) {
-                                    $loc = 8 + $k;
-                                    if (ord($mask[$k]) > 0 &&
-                                        isset($rid[$loc]) &&
-                                        $rid[$loc] != $hash[$loc]) {
-                                        $add_flag = false;
-                                        break;
-                                    }
-                                }
-                            }
-                            if ($add_flag) {
-                                $out_info[$record[0]] = $record;
-                            } else {
-                                if ($record[3] < $total) {
-                                    $total -= $record[3];
-                                }
-                            }
-                        }
-                        IndexManager::$dictionary[$id][$hash][$shift
-                           ][$mask] = [$total, $out_info];
-                        return ($with_remaining_total) ?
-                            IndexManager::$dictionary[$id][
-                            $hash][$shift][$mask] :
-                            IndexManager::$dictionary[$id][
-                            $hash][$shift][$mask][1];
-                    }
-                }
-            }
-            if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) &&
-                $start_generation < 0
-                && file_exists(C\WORK_DIRECTORY."/feeds/index")) {
-                //NO_FEEDS defined true in statistic_controller.php
-                $use_feeds = true;
-                $feed_shard = IndexManager::getIndex("feed");
-                $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
-                    $mask);
-                if (is_array($feed_info)) {
-                    $tmp[-1] = [-1, $feed_info[0],
-                        $feed_info[1], $feed_info[2], $feed_info[3]];
-                }
+        $tmp = [];
+        $test_mask = "";
+        if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) &&
+            $start_generation < 0
+            && file_exists(C\WORK_DIRECTORY . "/feeds/index")) {
+            //NO_FEEDS defined true in statistic_controller.php
+            $use_feeds = true;
+            $feed_shard = self::getIndex("feed");
+            $feed_info = $feed_shard->getWordInfo($hash, true, $shift,
+                $mask);
+            if (is_array($feed_info)) {
+                $tmp[-1] = [-1, $feed_info[0],
+                    $feed_info[1], $feed_info[2], $feed_info[3]];
             }
+        }
+        if (!empty($index->dictionary)) {
             $pre_info =
-                $index->dictionary->getWordInfo($hash, true, $shift, $mask,
-                $threshold, $start_generation, $num_distinct_generations, true);
-            if (isset($pre_info[1])) {
-                list($total, $info) = $pre_info;
-            } else {
-                $total = 0;
-                $info = [];
-            }
-            if (isset($tmp[-1][3])) {
-                $total += $tmp[-1][3];
-                $info = $tmp + $info;
-            }
-            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] =
-                [$total, $info];
+                $index->dictionary->getWordInfo($hash, true, $shift,
+                $mask, $threshold, $start_generation,
+                $num_distinct_generations, true);
+        }
+        if (!empty($pre_info[1])) {
+            list($total, $info) = $pre_info;
+        } else {
+            $total = 0;
+            $info = [];
+        }
+        if (isset($tmp[-1][3])) {
+            $total += $tmp[-1][3];
+            $info = $tmp + $info;
         }
-        return ($with_remaining_total) ?
-            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold]:
-            IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold][1];
+        return ($with_remaining_total) ? [$total, $info] : $info;
     }
     /**
      * Returns the number of document that a given term or phrase appears in
@@ -277,7 +220,7 @@ class IndexManager implements CrawlConstants
         $threshold = -1, $start_generation = -1,
         $num_distinct_generations = C\NUM_DISTINCT_GENERATIONS)
     {
-        $index = IndexManager::getIndex($index_name);
+        $index = self::getIndex($index_name);
         if (!$index->dictionary) {
             return false;
         }
@@ -290,12 +233,12 @@ class IndexManager implements CrawlConstants
         foreach ($hashes as $hash) {
             if (is_array($hash)) {
                 list($num_docs, ) =
-                    IndexManager::getWordInfo($index_name, $hash[0],
+                    self::getWordInfo($index_name, $hash[0],
                         $hash[1], $hash[2], $threshold, $start_generation,
                         $num_distinct_generations, true);
             } else {
                 list($num_docs, ) =
-                    IndexManager::getWordInfo($index_name, $hash, 0, "",
+                    self::getWordInfo($index_name, $hash, 0, "",
                     $threshold, $start_generation, $num_distinct_generations,
                     true);
             }
@@ -306,4 +249,4 @@ class IndexManager implements CrawlConstants
         }
         return $total_num_docs;
     }
-}
\ No newline at end of file
+}
diff --git a/src/library/WebSite.php b/src/library/WebSite.php
index 3bdc8a1e0..80a8792ee 100644
--- a/src/library/WebSite.php
+++ b/src/library/WebSite.php
@@ -92,7 +92,7 @@ class WebSite
      */
     public function __construct($base_path = "")
     {
-        $this->default_server_globals = [];
+        $this->default_server_globals = ["MAX_CACHE_FILESIZE" => 2000000];
         $this->http_methods = array_keys($this->routes);
         if (empty($base_path)) {
             $pathinfo = pathinfo($_SERVER['SCRIPT_NAME']);
@@ -522,22 +522,34 @@ class WebSite
      */
     public function filePutContents($filename, $data)
     {
-        if (isset($this->file_cache['PATH'][$filename])) {
-            /*
-                we are caching realpath which already has its own cache
-                realpath's cache is based on time, ours is based on
-                the marking algorithm.
-             */
-            $path = $this->file_cache['PATH'][$filename];
-        } else {
-            $path = realpath($filename);
-            $this->file_cache['PATH'][$filename] = $path;
+        $num_bytes = strlen($data);
+        $fits_in_cache =
+            $num_bytes < $this->default_server_globals['MAX_CACHE_FILESIZE'];
+        if ($fits_in_cache) {
+            if (isset($this->file_cache['PATH'][$filename])) {
+                /*
+                    we are caching realpath which already has its own cache
+                    realpath's cache is based on time, ours is based on
+                    the marking algorithm.
+                 */
+                $path = $this->file_cache['PATH'][$filename];
+            } else {
+                $path = realpath($filename);
+                $this->file_cache['PATH'][$filename] = $path;
+            }
         }
         if ($this->isCli()) {
-            if (isset($this->file_cache['MARKED'][$path])) {
-                $this->file_cache['MARKED'][$path] = $data;
-            } else if (isset($this->file_cache['UNMARKED'][$path])) {
-                $this->file_cache['UNMARKED'][$path] = $data;
+            if ($fits_in_cache)  {
+                if (isset($this->file_cache['MARKED'][$path])) {
+                    $this->file_cache['MARKED'][$path] = $data;
+                } else if (isset($this->file_cache['UNMARKED'][$path])) {
+                    $this->file_cache['UNMARKED'][$path] = $data;
+                }
+            } else if (!empty($this->file_cache['PATH'][$filename])) {
+                $path = $this->file_cache['PATH'][$filename];
+                unset($this->file_cache['MARKED'][$path],
+                    $this->file_cache['UNMARKED'][$path],
+                    $this->file_cache['PATH'][$filename]);
             }
         }
         $num_bytes = file_put_contents($filename, $data);
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 294b0f4ea..71994ab3d 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -144,7 +144,8 @@ class PhraseModel extends ParallelModel
                     $pattern = "/(\s)(weight:(\S)+)/";
                     preg_match_all($pattern, $query, $matches);
                     if (isset($matches[2][0])) {
-                        $base_weight = substr($matches[2][0], strlen("weight:"));
+                        $base_weight = substr($matches[2][0],
+                            strlen("weight:"));
                         $disjunct_string =
                             preg_replace($pattern, "", $disjunct_string);
                     }
@@ -754,8 +755,6 @@ class PhraseModel extends ParallelModel
                 $format_words[] = $pre_format_words[$i];
             }
         }
-        //clear index manager cache so mask stuff isn't cached *** revisit
-        IndexManager::$dictionary = [];
         return [$word_struct, $format_words];
     }
     /**
@@ -1163,7 +1162,7 @@ class PhraseModel extends ParallelModel
                 } else {
                     $cached_time = $time;
                 }
-                if (C\MAX_QUERY_CACHE_TIME > 0 &&
+                if (C\MIN_QUERY_CACHE_TIME > 0 &&
                     $cached_time > C\MAX_QUERY_CACHE_TIME) {
                     $results = false;
                 }
ViewGit