Take 2 on last
<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett chris@pollett.org
* @license https://www.gnu.org/licenses/ GPL3
* @link https://www.seekquarry.com/
* @copyright 2009 - 2024
* @filesource
*/
namespace seekquarry\yioop\library;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\ParallelModel;
/**
* For crawlHash
*/
require_once __DIR__ . "/Utility.php";
/**
* Class used to manage open IndexArchiveBundle's while performing
* a query. Ensures an easy place to obtain references to these bundles
* and ensures only one object per bundle is instantiated in a Singleton-esque
* way.
*
* @author Chris Pollett
*/
class IndexManager implements CrawlConstants
{
    /**
     * Open IndexArchiveBundle's managed by this manager
     * @var array
     */
    public static $indexes = [];
    /**
     * List of entries of the form name of bundle => time when cached
     * @var array
     */
    public static $index_times = [];
    /**
     * Cache of most recent page version lookups. Keys are made from the
     * index name and the url hash (see urlCacheKey()), values are
     * [partition, posting] pairs as returned by lookupLatestVersionPage
     * @var array
     */
    public static $urls_cache = [];
    /**
     * Cache of results of getWordInfo calls. Keys are hashes of the
     * arguments of the call, values are what the bundle returned. Entries
     * are kept in least recently used first order.
     * @var array
     */
    protected static $info_cache = [];
    /**
     * Max number of IndexArchiveBundles that can be cached
     */
    const INDEX_CACHE_SIZE = 1000;
    /**
     * Max number of URLs to be cached for most recent version of a page lookup
     */
    const URLS_CACHE_SIZE = 1000;
    /**
     * Max number of Word Info items that can be cached
     */
    const INFO_CACHE_SIZE = 1000;
    /**
     * Returns a reference to the managed copy of an IndexArchiveBundle object
     * with a given timestamp or feed (for handling media feeds)
     *
     * Names are resolved in the following order: the feed bundle ("feed"
     * or FEED_CRAWL_TIME), a numeric timestamp with a plain index folder in
     * the cache directory, a numeric timestamp with a double index folder,
     * and finally an arbitrary existing path. Folders containing a
     * summaries subfolder are opened as IndexArchiveBundle's, all others
     * as IndexDocumentBundle's.
     *
     * @param string $index_name timestamp of desired IndexArchiveBundle
     * @return object the desired IndexArchiveBundle reference, or false if
     *  $index_name could not be resolved to an existing bundle
     */
    public static function getIndex($index_name)
    {
        $index_name = trim($index_name ?? ""); //trim to fix postgres quirkiness
        $cache_dir = C\CACHE_DIR . '/';
        $index_archive_full_path = $cache_dir . self::index_data_base_name .
            $index_name;
        $handled = false;
        if ($index_name == "feed" || $index_name == self::FEED_CRAWL_TIME) {
            $index_archive_name = self::feed_index_data_base_name;
            $index_name = "feed";
            $handled = true;
        } else if (is_numeric($index_name) &&
            file_exists($index_archive_full_path)) {
            $index_archive_name = self::index_data_base_name . $index_name;
            $handled = true;
        }
        if (!$handled && is_numeric($index_name)) {
            // no plain index folder for this timestamp, try a double index
            $index_name = $cache_dir . self::double_index_base_name .
                $index_name;
        }
        if (!$handled && file_exists($index_name)) {
            $is_repeating = file_exists("$index_name/bundle0");
            $serve_archive = "0";
            if ($is_repeating) {
                $status_file = "$index_name/status.txt";
                if (file_exists($status_file)) {
                    $status = unserialize(file_get_contents($status_file));
                    $active_archive = (empty($status["swap_count"])) ? 1 :
                        $status["swap_count"] % 2;
                    // serve from the bundle that is not the active one
                    $serve_archive = 1 - $active_archive;
                }
            }
            $sub_folder = ($is_repeating) ? "/bundle$serve_archive" : "";
            $is_old = ($is_repeating) ?
                file_exists($index_name . "/bundle0/summaries") :
                file_exists($index_name . "/summaries");
            $bundle_class_name = C\NS_LIB . (($is_old) ?
                "IndexArchiveBundle" : "IndexDocumentBundle");
            $tmp = new $bundle_class_name($index_name . $sub_folder);
        } else if (!$handled) {
            return false;
        }
        /* (Re)load if nothing is cached under this name, or if this is
           the feed index of a command line process and the cached copy is
           older than MIN_QUERY_CACHE_TIME
         */
        $is_stale_feed = !empty(self::$index_times[$index_name]) &&
            ($index_name == 'feed' && php_sapi_name() == 'cli') &&
            (time() - self::$index_times[$index_name])
            > C\MIN_QUERY_CACHE_TIME;
        if (empty(self::$indexes[$index_name]) || $is_stale_feed) {
            if (!isset($tmp)) {
                $index_filename = $cache_dir . $index_archive_name;
                if (file_exists($index_filename)) {
                    $is_old = file_exists($index_filename . "/summaries");
                    $bundle_class_name = C\NS_LIB . (($is_old) ?
                        "IndexArchiveBundle" : "IndexDocumentBundle");
                    $tmp = new $bundle_class_name($index_filename);
                } else {
                    $use_name = $index_name;
                    $serve_archive = -1;
                    if (preg_match("/\-\d$/", $index_name)) {
                        // name of the form base-digit picks the sub-bundle
                        $serve_archive = substr($index_name, -1);
                        $use_name = substr($index_name, 0, -2);
                    }
                    $index_archive_name = self::double_index_base_name .
                        $use_name;
                    $double_index_path = $cache_dir . $index_archive_name;
                    $status_file = $double_index_path . "/status.txt";
                    if ($serve_archive < 0 && file_exists($status_file)) {
                        $status = unserialize(
                            file_get_contents($status_file));
                        $active_archive = (empty($status["swap_count"])) ?
                            1 : $status["swap_count"] % 2;
                        $serve_archive = 1 - $active_archive;
                    }
                    if ($serve_archive < 0) {
                        /* nothing told us which sub-bundle to serve; use
                           bundle 0 rather than build a bundle-1 path
                         */
                        $serve_archive = 0;
                    }
                    // format is detected on the double index folder itself
                    $is_old = file_exists(
                        $double_index_path . "/bundle0/summaries") ||
                        file_exists($double_index_path . "/bundle1/summaries");
                    $bundle_class_name = C\NS_LIB . (($is_old) ?
                        "IndexArchiveBundle" : "IndexDocumentBundle");
                    $tmp = new $bundle_class_name($double_index_path .
                        "/bundle$serve_archive");
                }
            }
            self::$indexes[$index_name] = $tmp;
            if ($is_old) {
                self::$indexes[$index_name]->setCurrentShard(0, true);
            }
            self::$index_times[$index_name] = time();
            /*
                If too many cached discard oldest 1/3 of cached indices
            */
            if (count(self::$indexes) > self::INDEX_CACHE_SIZE) {
                $times = array_values(self::$index_times);
                sort($times);
                $oldest_third = $times[intdiv(count($times), 3)];
                foreach (self::$index_times as $name => $time) {
                    /* numeric names come back as int keys, so compare as
                       strings; never evict the bundle about to be returned
                     */
                    if ($time <= $oldest_third &&
                        (string) $name !== $index_name) {
                        unset(self::$index_times[$name],
                            self::$indexes[$name]);
                    }
                }
            }
        }
        return self::$indexes[$index_name];
    }
    /**
     * Clears the static variables in which caches of read in indexes,
     * dictionary (word) info, and latest page version lookups are stored.
     */
    public static function clearCache()
    {
        self::$indexes = [];
        self::$index_times = [];
        self::$urls_cache = [];
        self::$info_cache = [];
    }
    /**
     * Returns the version of the index, so that Yioop can determine
     * how to do word lookup. The only major change to the format was
     * when word_id's went from 8 to 20 bytes which happened around Unix
     * time 1369754208.
     *
     * @param string $index_name unix timestamp of index
     * @return int 0 - if the original format for Yioop indexes; 1 -if 20 byte
     *  word_id format
     */
    public static function getVersion($index_name)
    {
        $index_name = (string) $index_name;
        // ignore a leading '-' on the name
        $index_name = (empty($index_name) || $index_name[0] != '-') ?
            $index_name : substr($index_name, 1);
        $index_name_int = intval($index_name);
        if (!is_numeric($index_name)) {
            // non-numeric names are treated as paths to a bundle folder
            $description_file = $index_name . "/summaries/description.txt";
            if (file_exists($description_file)) {
                $description = unserialize(
                    file_get_contents($description_file));
                if (!empty($description['DESCRIPTION'])) {
                    $description = unserialize($description['DESCRIPTION']);
                }
                if (!empty($description[self::CRAWL_TIME])) {
                    $crawl_time = intval($description[self::CRAWL_TIME]);
                    if ($crawl_time < C\VERSION_0_TIMESTAMP &&
                        $crawl_time != self::FEED_CRAWL_TIME) {
                        return 0;
                    }
                }
            }
        } else if ($index_name_int != self::FEED_CRAWL_TIME &&
            $index_name_int < C\VERSION_0_TIMESTAMP) {
            return 0;
        }
        // otherwise ask the bundle itself, if it records a version
        $tmp_index = self::getIndex($index_name);
        if (isset($tmp_index->version)) {
            return $tmp_index->version;
        } else if (isset($tmp_index->archive_info['VERSION'])) {
            return $tmp_index->archive_info['VERSION'];
        }
        return C\DEFAULT_CRAWL_FORMAT;
    }
    /**
     * Gets an array of posting list positions for each shard in the
     * bundle $index_name for the word id $term_id
     *
     * Results are cached by the arguments of the call; the cache holds
     * at most INFO_CACHE_SIZE entries and discards the least recently used
     * one when full.
     *
     * @param string $index_name bundle to look for $term_id in
     * @param string $term_id id of phrase or word to look up in bundle
     *  dictionary
     * @param int $threshold after the number of results exceeds this amount
     *  stop looking for more dictionary entries.
     * @param int $start_generation what generation in the index to start
     *  finding occurrence of phrase from
     * @param int $num_distinct_generations from $start_generation how
     *  many generation to search forward to
     * @param bool $with_remaining_total whether to total number of
     *  postings found as well or not
     * @return array either [total, sequence of four tuples]
     *  or sequence of four tuples:
     *  (index_shard generation, posting_list_offset, length, exact id
     *  that match $term_id). If the index cannot be opened an empty
     *  result of the requested shape ([0, []] or []) is returned.
     */
    public static function getWordInfo($index_name, $term_id, $threshold = -1,
        $start_generation = -1, $num_distinct_generations = -1,
        $with_remaining_total = false)
    {
        $start_generation = ($start_generation < 0) ? 0 : $start_generation;
        /* serialize() length-prefixes each field, so different argument
           tuples can never produce the same key (plain concatenation
           could, e.g., 1 . 11 versus 11 . 1, or false versus "")
         */
        $lookup_hash = crawlHash(serialize(array_map('strval',
            [$index_name, $term_id, $threshold, $start_generation,
            $num_distinct_generations, $with_remaining_total])));
        if (isset(self::$info_cache[$lookup_hash])) {
            // re-insert so this entry becomes the most recently used
            $tmp = self::$info_cache[$lookup_hash];
            unset(self::$info_cache[$lookup_hash]);
            self::$info_cache[$lookup_hash] = $tmp;
            return $tmp;
        }
        $index = self::getIndex($index_name);
        if (!$index) {
            // getIndex returns false for an unknown bundle
            return ($with_remaining_total) ? [0, []] : [];
        }
        $word_info = $index->getWordInfo($term_id,
            $threshold, $start_generation, $num_distinct_generations,
            $with_remaining_total);
        self::$info_cache[$lookup_hash] = $word_info;
        if (count(self::$info_cache) >= self::INFO_CACHE_SIZE) {
            array_shift(self::$info_cache);
        }
        return $word_info;
    }
    /**
     * Finds posting info related to the most recent version
     * of a URL in the given index
     *
     * Successful lookups are cached per (index, url hash) pair; the cache
     * is emptied once it holds URLS_CACHE_SIZE entries.
     *
     * @param string $url_hash hash of the URL to be looked up
     * @param string $index_name current index
     * @return array|null [partition, posting] for the last posting of the
     *  last row found, or null if no posting info was found
     */
    public static function lookupLatestVersionPage($url_hash, $index_name)
    {
        // Check if the url hash exists in the cache for this index
        $cache_key = self::urlCacheKey($index_name, $url_hash);
        if (array_key_exists($cache_key, self::$urls_cache)) {
            return self::$urls_cache[$cache_key];
        }
        $model_for_url_hash_lookup = new ParallelModel();
        $page_versions = $model_for_url_hash_lookup->
            lookupSummaryOffsetGeneration(L\base64Hash($url_hash),
            $index_name, false, true);
        if (empty($page_versions['ROWS']) ||
            !is_array($page_versions['ROWS'])) {
            return null;
        }
        $latest_row = end($page_versions['ROWS']);
        $latest_postings_info = $latest_row['POSTINGS'] ?? null;
        $latest_partition = $latest_row['PARTITION'] ?? null;
        if (!is_array($latest_postings_info) ||
            count($latest_postings_info) == 0) {
            return null;
        }
        $latest_posting = end($latest_postings_info);
        if (count(self::$urls_cache) >= self::URLS_CACHE_SIZE) {
            self::$urls_cache = [];
        }
        self::$urls_cache[$cache_key] = [$latest_partition, $latest_posting];
        return self::$urls_cache[$cache_key];
    }
    /**
     * Builds the key under which a latest page version lookup is stored in
     * $urls_cache. The index name is part of the key so that the same url
     * looked up in two different indexes does not share a cache entry.
     *
     * @param string $index_name index the lookup is done against
     * @param string $url_hash hash of the URL being looked up
     * @return string key to use with $urls_cache
     */
    protected static function urlCacheKey($index_name, $url_hash)
    {
        // a NUL byte cannot occur in a timestamp or a file system path
        return $index_name . "\0" . $url_hash;
    }
}