First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures.
Folder structure of IndexDocumentBundles also modified and now supports overflow folder (which
could be on a different hard drive). ArcTool has been updated to support migration to new
indexes
<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett chris@pollett.org
* @license https://www.gnu.org/licenses/ GPL3
* @link https://www.seekquarry.com/
* @copyright 2009 - 2023
* @filesource
*/
namespace seekquarry\yioop\executables;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\media_jobs as MJ;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\IndexDocumentBundle;
// This tool may only be run from the command line, never through a web server
$is_cli = (php_sapi_name() == 'cli');
if (!$is_cli || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
    echo "BAD REQUEST";
    exit();
}
/** For crawlHash, crawlHashWord function */
require_once __DIR__ . "/../library/Utility.php";
/* Reading in a whole shard might take a fair bit of memory, so raise the
   limit before any partitions are loaded */
ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT);
// Set up multi-byte string handling to use UTF-8
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
 * Command-line tool that adds the words of already-downloaded document
 * partitions to the dictionary of an IndexDocumentBundle (or
 * FeedDocumentBundle). It processes one partition per invocation.
 *
 * @author Chris Pollett
 */
class DictionaryUpdater implements CrawlConstants
{
    /**
     * The main code for the dictionary updater. Updates the dictionary
     * for the IndexDocumentBundle at $bundle_path running on channel
     * $channel, from its current next-partition-to-process up to the
     * current save partition. Partitions are groups of documents that
     * have been downloaded, but whose words have not necessarily been
     * added to the dictionary for the bundle.
     *
     * @param int $channel the channel the crawl is running on. Used in
     *      naming lock files
     * @param string $bundle_path the path to the IndexDocumentBundle or
     *      FeedDocumentBundle we are adding dictionary info for
     */
    public static function run($channel, $bundle_path)
    {
        if (!isset($_SERVER["LOG_TO_FILES"])) {
            // first invocation: route log output to a per-channel file
            $_SERVER["LOG_TO_FILES"] = true;
            $process_name = $channel . "-DictionaryUpdater";
            L\crawlLog("\n\nInitialize logger..", $process_name, true);
        }
        $bundle_kind = self::getArchiveKind($bundle_path);
        if (!$bundle_kind) {
            L\crawlLog("Unknown Archive Type Exiting...");
            exit();
        }
        $bundle_class = C\NS_LIB . $bundle_kind;
        if ($bundle_kind != "FeedDocumentBundle") {
            $archive = new $bundle_class($bundle_path,
                false, "", C\NUM_DOCS_PER_PARTITION);
        } else {
            /* feed bundles need a database connection so the feed
               sources' auxiliary info can be loaded */
            $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
            $db = new $db_class();
            $db->connect();
            $archive = new $bundle_class($bundle_path, $db,
                false, null, C\NUM_DOCS_PER_PARTITION);
            $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss'
                OR TYPE='html' OR TYPE='json' OR TYPE='regex')";
            $result = $db->execute($sql);
            $feeds = [];
            while ($feed = $db->fetchArray($result)) {
                MJ\FeedsUpdateJob::parseFeedAuxInfo($feed);
                $feeds[] = $feed;
            }
            $archive->feeds = $feeds;
        }
        $partition_file = $bundle_path . "/" .
            IndexDocumentBundle::NEXT_PARTITION_FILE;
        if (!file_exists($partition_file)) {
            // no bookkeeping file yet, so start from partition 0
            $archive->next_partition_to_add = 0;
            file_put_contents($partition_file,
                $archive->next_partition_to_add);
        } else {
            $archive->next_partition_to_add = intval(
                file_get_contents($partition_file));
        }
        //note the false parameter means will only update for one partition
        $archive->updateDictionary($partition_file, false);
    }
    /**
     * Given a folder name, determines the kind of bundle (if any) it holds.
     * It does this based on the expected location of the description.txt file,
     * or arc_description.ini (in the case of a non-yioop archive)
     *
     * @param string $archive_path the path to archive folder
     * @return mixed the archive bundle type as a string (one of
     *      DoubleIndexBundleOld, DoubleIndexBundle, WebArchiveBundle,
     *      FeedArchiveBundle, FeedDocumentBundle, IndexArchiveBundle,
     *      IndexDocumentBundle, or an arc_type from arc_description.ini),
     *      or false if the folder holds no recognizable bundle
     */
    public static function getArchiveKind($archive_path)
    {
        if (file_exists("$archive_path/bundle0")) {
            // the old double-bundle layout kept summaries under bundle0
            return (file_exists(
                "$archive_path/bundle0/summaries/description.txt")) ?
                "DoubleIndexBundleOld" : "DoubleIndexBundle";
        }
        if (file_exists("$archive_path/description.txt")) {
            return "WebArchiveBundle";
        }
        if (file_exists("$archive_path/filter_a.ftr")) {
            // a dedup filter marks a feed bundle; layout decides which kind
            return (file_exists("$archive_path/summaries/description.txt")) ?
                "FeedArchiveBundle" : "FeedDocumentBundle";
        }
        if (file_exists("$archive_path/summaries/description.txt")) {
            return "IndexArchiveBundle";
        }
        if (file_exists("$archive_path/archive_info.txt")) {
            return "IndexDocumentBundle";
        }
        // not a yioop bundle; see if it is a described non-yioop archive
        $desc_path = "$archive_path/arc_description.ini";
        if (!file_exists($desc_path)) {
            return false;
        }
        $desc = L\parse_ini_with_fallback($desc_path);
        return $desc['arc_type'] ?? false;
    }
}
$cli_command = empty($argv[1]) ? "" : $argv[1];
if (!empty($argv[3]) && in_array($cli_command, ["run", "terminal"])) {
    /*
     * Instantiate and run the DictionaryUpdater. In terminal mode,
     * log output goes to the console rather than to log files.
     */
    if ($cli_command == "terminal") {
        $_SERVER["LOG_TO_FILES"] = "terminal";
    }
    DictionaryUpdater::run($argv[2], $argv[3]);
}