diff --git a/src/configs/Config.php b/src/configs/Config.php index 92298f9e6..afed87782 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -437,6 +437,7 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) { nsdefine('API_ACCESS', true); nsdefine('REGISTRATION_TYPE', 'disable_registration'); nsdefine('USE_MAIL_PHP', true); + nsdefine('MAIL_SENDER', ''); nsdefine('MAIL_SERVER', ''); nsdefine('MAIL_PORT', ''); nsdefine('MAIL_USERNAME', ''); diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index c0550cdef..e64726e22 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -87,7 +87,7 @@ class SearchController extends Controller implements CrawlConstants } else if (C\USE_FILECACHE) { $phrase_model::$cache = new FileCache(C\WORK_DIRECTORY . "/cache/queries", - $web_site); + $this->web_site); $_SERVER["USE_CACHE"] = true; } else { $_SERVER["USE_CACHE"] = false; diff --git a/src/controllers/components/SystemComponent.php b/src/controllers/components/SystemComponent.php index 04f4c3862..c098b84d7 100755 --- a/src/controllers/components/SystemComponent.php +++ b/src/controllers/components/SystemComponent.php @@ -553,9 +553,11 @@ class SystemComponent extends Component $data["ELEMENT"] = "serversettings"; switch ($arg) { case "clearCache": - $cache = new L\FileCache(C\WORK_DIRECTORY . "/cache/queries"); - $cache->clear(); - $parent->web_site->clearFileCache(); + L\IndexManager::clearCache(); + $phrase_model = $parent->model("phrase"); + if (!empty($phrase_model::$cache)) { + $phrase_model::$cache->clear(); + } return $parent->redirectWithMessage( tl('system_component_cache_cleared')); break; diff --git a/src/library/FileCache.php b/src/library/FileCache.php index ac5e1bf80..a76428501 100644 --- a/src/library/FileCache.php +++ b/src/library/FileCache.php @@ -75,7 +75,7 @@ class FileCache if (!is_dir($this->dir_name)) { mkdir($this->dir_name); chmod($this->dir_name, 0777); - $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager"; + $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $db_class(); $db->setWorldPermissionsRecursive($this->dir_name, true); } @@ -139,11 +139,6 @@ class FileCache /** * Stores in the cache a key-value pair * - * Only when a key is set is there a check for whether to invalidate - * a cache bin. It is deleted as invalid if the following two conditions - * both hold: - * The last time it was expired is more than SECONDS_IN_A_BIN seconds ago, - * and the number of cache items is more than self::MAX_FILES_IN_A_BIN. * * @param string $key to associate with value * @param mixed $value to store @@ -157,7 +152,7 @@ class FileCache chmod($checksum_dir, 0777); } $cache_file = "$checksum_dir/c".webencode($key); - $this->updateCache($key, true); + $this->updateCache($key); $this->filePutContents($cache_file, serialize($value)); } /** @@ -181,6 +176,9 @@ class FileCache */ public function clear() { + if (!empty($this->web_site)) { + $this->web_site->clearFileCache(); + } if (is_dir($this->dir_name)) { $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS)."Manager"; $db = new $db_class(); @@ -190,7 +188,7 @@ class FileCache /** * */ - protected function updateCache($key, $is_write = false) + protected function updateCache($key) { $checksum_block = $this->checksum($key); $checksum_dir = $this->dir_name . "/$checksum_block"; @@ -241,7 +239,7 @@ class FileCache unlink($delete_file); } } - if ($now - $data["TIME"] > C\ONE_HOUR) { + if ($now - $data["TIME"] > C\MIN_QUERY_CACHE_TIME) { $in_cache_files = array_flip(glob($checksum_dir . "/c*")); $keys = array_keys($data['UNMARKED']); foreach ($keys as $check_key) { diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index aef58de05..6f8f36c81 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -140,21 +140,21 @@ class IndexArchiveBundle implements CrawlConstants $is_dir = is_dir($this->dir_name); if (!$is_dir && !$read_only_archive) { mkdir($this->dir_name); - mkdir($this->dir_name."/posting_doc_shards"); + mkdir($this->dir_name . "/posting_doc_shards"); } else if (!$is_dir) { return false; } else { $index_archive_exists = true; } - if (file_exists($this->dir_name."/generation.txt")) { + if (file_exists($this->dir_name . "/generation.txt")) { $this->generation_info = unserialize( - file_get_contents($this->dir_name."/generation.txt")); + file_get_contents($this->dir_name . "/generation.txt")); } else if (!$read_only_archive) { $this->generation_info['ACTIVE'] = 0; - file_put_contents($this->dir_name."/generation.txt", + file_put_contents($this->dir_name . "/generation.txt", serialize($this->generation_info)); } - $this->summaries = new WebArchiveBundle($dir_name."/summaries", + $this->summaries = new WebArchiveBundle($dir_name . "/summaries", $read_only_archive, -1, $description); if (!$read_only_archive) { $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT"); @@ -164,7 +164,7 @@ class IndexArchiveBundle implements CrawlConstants $this->version = $this->summaries->version; } $this->num_docs_per_generation = $num_docs_per_generation; - $this->dictionary = new IndexDictionary($this->dir_name."/dictionary", + $this->dictionary = new IndexDictionary($this->dir_name . "/dictionary", $this); } /** @@ -470,4 +470,4 @@ class IndexArchiveBundle implements CrawlConstants { return WebArchiveBundle::getParamModifiedTime($dir_name."/summaries"); } -} \ No newline at end of file +} diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 76f40fd70..ee7d6b9a3 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * For crawlHash */ -require_once __DIR__."/Utility.php"; +require_once __DIR__ . "/Utility.php"; /** * Class used to manage open IndexArchiveBundle's while performing * a query. Ensures an easy place to obtain references to these bundles @@ -52,11 +52,13 @@ class IndexManager implements CrawlConstants */ public static $indexes = []; /** - * Used to cache word lookup of posting list locations for a given - * index - * @var array + * + */ + public static $index_times = []; + /** + * */ - public static $dictionary = []; + const INDEX_CACHE_SIZE = 1000; /** * Returns a reference to the managed copy of an IndexArchiveBundle object * with a given timestamp or an IndexShard in the case where @@ -68,12 +70,15 @@ class IndexManager implements CrawlConstants public static function getIndex($index_name) { $index_name = trim($index_name); //trim to fix postgres quirkiness - if (!isset(self::$indexes[$index_name])) { + if (empty(self::$indexes[$index_name]) || + ($index_name == "feed" && !empty(self::$index_times["feed"]) && + (time() - self::$index_times["feed"]) > C\MIN_QUERY_CACHE_TIME) ) { if ($index_name == "feed") { $index_file = C\WORK_DIRECTORY."/feeds/index"; if (file_exists($index_file)) { self::$indexes[$index_name] = new IndexShard( $index_file, 0, C\NUM_DOCS_PER_GENERATION, true); + self::$index_times["feed"] = time(); } else { return false; } @@ -86,10 +91,30 @@ class IndexManager implements CrawlConstants } self::$indexes[$index_name] = $tmp; self::$indexes[$index_name]->setCurrentShard(0, true); + self::$index_times[$index_name] = time(); + } + if (count(self::$indexes) > self::INDEX_CACHE_SIZE) { + $times = array_values(self::$index_times); + sort($times); + $oldest_third = $times[floor(count($times)/3)]; + foreach (self::$index_times as $name => $time) { + if ($time <= $oldest_third) { + unset(self::$index_times[$name], self::$indexes[$name]); + } + } } } return self::$indexes[$index_name]; } + /** + * Clears the static variables in which caches of read in indexes + * and dictionary info is stored. + */ + public static function clearCache() + { + self::$indexes = []; + self::$index_times = []; + } /** * Returns the version of the index, so that Yioop can determine * how to do word lookup.The only major change to the format was @@ -112,7 +137,7 @@ class IndexManager implements CrawlConstants return 1; } /** - * Gets an array posting list positions for each shard in the + * Gets an array of posting list positions for each shard in the * bundle $index_name for the word id $hash * * @param string $index_name bundle to look $hash in @@ -136,32 +161,7 @@ class IndexManager implements CrawlConstants $num_distinct_generations = -1, $with_remaining_total = false) { $id = "$index_name:$start_generation:$num_distinct_generations"; - $index = IndexManager::getIndex($index_name); - if (!$index->dictionary) { - $tmp = []; - if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) - && file_exists(C\WORK_DIRECTORY."/feeds/index")) { - //NO_FEEDS defined true in statistic_controller.php - $use_feeds = true; - $feed_shard = IndexManager::getIndex("feed"); - $feed_info = $feed_shard->getWordInfo($hash, true, $shift, - $mask); - if (is_array($feed_info)) { - $tmp[-1] = [-1, $feed_info[0], - $feed_info[1], $feed_info[2], $feed_info[3]]; - } - } - if ($tmp == []) { - return ($with_remaining_total) ? [0, false] : false; - } - IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] = - [$feed_info[3], $tmp]; - return ($with_remaining_total) ? - IndexManager::$dictionary[$id][$hash][$shift][$mask][ - $threshold] : - IndexManager::$dictionary[$id][$hash][$shift][$mask][ - $threshold][1]; - } + $index = self::getIndex($index_name); $len = strlen($mask); if ($len > 0) { $pre_hash = substr($hash, 0, 8) . @@ -169,95 +169,38 @@ class IndexManager implements CrawlConstants } else { $pre_hash = $hash; } - if (!isset(IndexManager::$dictionary[$id][$hash][$shift][$mask][ - $threshold])) { - $tmp = []; - $test_mask = ""; - if (isset(IndexManager::$dictionary[$id][$pre_hash][ - $shift])) { - foreach (IndexManager::$dictionary[$id][$pre_hash][ - $shift] as $test_mask => $data) { - $mask_len = strlen($test_mask); - if ($mask_len > $len) {continue; } - $mask_found = true; - for ($k = 0; $k < $mask_len; $k++) { - if (ord($test_mask[$k]) > 0 && - $test_mask[$k] != $mask[$k]) { - $mask_found = false; - break; - } - } - if ($mask_found && isset( - IndexManager::$dictionary[$id][$pre_hash][ - $shift][$test_mask][$threshold]) ) { - list($total, $info) = - IndexManager::$dictionary[$id][$pre_hash - ][$shift][$test_mask][$threshold]; - $out_info = []; - foreach ($info as $record) { - $rid = $record[4]; - $add_flag = true; - if ($mask != "") { - for ($k = 0; $k < $len; $k++) { - $loc = 8 + $k; - if (ord($mask[$k]) > 0 && - isset($rid[$loc]) && - $rid[$loc] != $hash[$loc]) { - $add_flag = false; - break; - } - } - } - if ($add_flag) { - $out_info[$record[0]] = $record; - } else { - if ($record[3] < $total) { - $total -= $record[3]; - } - } - } - IndexManager::$dictionary[$id][$hash][$shift - ][$mask] = [$total, $out_info]; - return ($with_remaining_total) ? - IndexManager::$dictionary[$id][ - $hash][$shift][$mask] : - IndexManager::$dictionary[$id][ - $hash][$shift][$mask][1]; - } - } - } - if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) && - $start_generation < 0 - && file_exists(C\WORK_DIRECTORY."/feeds/index")) { - //NO_FEEDS defined true in statistic_controller.php - $use_feeds = true; - $feed_shard = IndexManager::getIndex("feed"); - $feed_info = $feed_shard->getWordInfo($hash, true, $shift, - $mask); - if (is_array($feed_info)) { - $tmp[-1] = [-1, $feed_info[0], - $feed_info[1], $feed_info[2], $feed_info[3]]; - } + $tmp = []; + $test_mask = ""; + if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) && + $start_generation < 0 + && file_exists(C\WORK_DIRECTORY . "/feeds/index")) { + //NO_FEEDS defined true in statistic_controller.php + $use_feeds = true; + $feed_shard = self::getIndex("feed"); + $feed_info = $feed_shard->getWordInfo($hash, true, $shift, + $mask); + if (is_array($feed_info)) { + $tmp[-1] = [-1, $feed_info[0], + $feed_info[1], $feed_info[2], $feed_info[3]]; } + } + if (!empty($index->dictionary)) { $pre_info = - $index->dictionary->getWordInfo($hash, true, $shift, $mask, - $threshold, $start_generation, $num_distinct_generations, true); - if (isset($pre_info[1])) { - list($total, $info) = $pre_info; - } else { - $total = 0; - $info = []; - } - if (isset($tmp[-1][3])) { - $total += $tmp[-1][3]; - $info = $tmp + $info; - } - IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold] = - [$total, $info]; + $index->dictionary->getWordInfo($hash, true, $shift, + $mask, $threshold, $start_generation, + $num_distinct_generations, true); + } + if (!empty($pre_info[1])) { + list($total, $info) = $pre_info; + } else { + $total = 0; + $info = []; + } + if (isset($tmp[-1][3])) { + $total += $tmp[-1][3]; + $info = $tmp + $info; } - return ($with_remaining_total) ? - IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold]: - IndexManager::$dictionary[$id][$hash][$shift][$mask][$threshold][1]; + return ($with_remaining_total) ? [$total, $info] : $info; } /** * Returns the number of document that a given term or phrase appears in @@ -277,7 +220,7 @@ class IndexManager implements CrawlConstants $threshold = -1, $start_generation = -1, $num_distinct_generations = C\NUM_DISTINCT_GENERATIONS) { - $index = IndexManager::getIndex($index_name); + $index = self::getIndex($index_name); if (!$index->dictionary) { return false; } @@ -290,12 +233,12 @@ class IndexManager implements CrawlConstants foreach ($hashes as $hash) { if (is_array($hash)) { list($num_docs, ) = - IndexManager::getWordInfo($index_name, $hash[0], + self::getWordInfo($index_name, $hash[0], $hash[1], $hash[2], $threshold, $start_generation, $num_distinct_generations, true); } else { list($num_docs, ) = - IndexManager::getWordInfo($index_name, $hash, 0, "", + self::getWordInfo($index_name, $hash, 0, "", $threshold, $start_generation, $num_distinct_generations, true); } @@ -306,4 +249,4 @@ class IndexManager implements CrawlConstants } return $total_num_docs; } -} \ No newline at end of file +} diff --git a/src/library/WebSite.php b/src/library/WebSite.php index 3bdc8a1e0..80a8792ee 100644 --- a/src/library/WebSite.php +++ b/src/library/WebSite.php @@ -92,7 +92,7 @@ class WebSite */ public function __construct($base_path = "") { - $this->default_server_globals = []; + $this->default_server_globals = ["MAX_CACHE_FILESIZE" => 2000000]; $this->http_methods = array_keys($this->routes); if (empty($base_path)) { $pathinfo = pathinfo($_SERVER['SCRIPT_NAME']); @@ -522,22 +522,34 @@ class WebSite */ public function filePutContents($filename, $data) { - if (isset($this->file_cache['PATH'][$filename])) { - /* - we are caching realpath which already has its own cache - realpath's cache is based on time, ours is based on - the marking algorithm. - */ - $path = $this->file_cache['PATH'][$filename]; - } else { - $path = realpath($filename); - $this->file_cache['PATH'][$filename] = $path; + $num_bytes = strlen($data); + $fits_in_cache = + $num_bytes < $this->default_server_globals['MAX_CACHE_FILESIZE']; + if ($fits_in_cache) { + if (isset($this->file_cache['PATH'][$filename])) { + /* + we are caching realpath which already has its own cache + realpath's cache is based on time, ours is based on + the marking algorithm. + */ + $path = $this->file_cache['PATH'][$filename]; + } else { + $path = realpath($filename); + $this->file_cache['PATH'][$filename] = $path; + } } if ($this->isCli()) { - if (isset($this->file_cache['MARKED'][$path])) { - $this->file_cache['MARKED'][$path] = $data; - } else if (isset($this->file_cache['UNMARKED'][$path])) { - $this->file_cache['UNMARKED'][$path] = $data; + if ($fits_in_cache) { + if (isset($this->file_cache['MARKED'][$path])) { + $this->file_cache['MARKED'][$path] = $data; + } else if (isset($this->file_cache['UNMARKED'][$path])) { + $this->file_cache['UNMARKED'][$path] = $data; + } + } else if (!empty($this->file_cache['PATH'][$filename])) { + $path = $this->file_cache['PATH'][$filename]; + unset($this->file_cache['MARKED'][$path], + $this->file_cache['UNMARKED'][$path], + $this->file_cache['PATH'][$filename]); } } $num_bytes = file_put_contents($filename, $data); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 294b0f4ea..71994ab3d 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -144,7 +144,8 @@ class PhraseModel extends ParallelModel $pattern = "/(\s)(weight:(\S)+)/"; preg_match_all($pattern, $query, $matches); if (isset($matches[2][0])) { - $base_weight = substr($matches[2][0], strlen("weight:")); + $base_weight = substr($matches[2][0], + strlen("weight:")); $disjunct_string = preg_replace($pattern, "", $disjunct_string); } @@ -754,8 +755,6 @@ class PhraseModel extends ParallelModel $format_words[] = $pre_format_words[$i]; } } - //clear index manager cache so mask stuff isn't cached *** revisit - IndexManager::$dictionary = []; return [$word_struct, $format_words]; } /** @@ -1163,7 +1162,7 @@ class PhraseModel extends ParallelModel } else { $cached_time = $time; } - if (C\MAX_QUERY_CACHE_TIME > 0 && + if (C\MIN_QUERY_CACHE_TIME > 0 && $cached_time > C\MAX_QUERY_CACHE_TIME) { $results = false; }