viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\executables;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\media_jobs as MJ;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\IndexDocumentBundle;

/*
 * This script is only meant to be run from the command line, and not
 * from within a process that has defined IS_OWN_WEB_SERVER.
 */
if (php_sapi_name() != 'cli' ||
    defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
    echo "BAD REQUEST";
    exit();
}
/** For crawlHash, crawlHashWord function */
require_once __DIR__."/../library/Utility.php";
/* reading in a whole shard might take a fair bit of memory */
ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT);
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
 * Command-line tool that opens the bundle stored at a given path and asks
 * it to update its dictionary starting from the partition number recorded
 * in the bundle's next partition file.
 *
 * @author Chris Pollett
 */
class DictionaryUpdater implements CrawlConstants
{
    /**
     * The main code for the dictionary updater, updates the
     * dictionary for the IndexDocumentBundle at $bundle_path running
     * on channel $channel from its current next_partition to process
     * to the current save partition. Partitions are groups of documents
     * that have been downloaded, but whose words have not necessarily been
     * added to the dictionary for the bundle.
     *
     * If the kind of bundle at $bundle_path cannot be determined, a message
     * is logged and the process exits.
     *
     * @param int $channel the channel the crawl is running on. Used in
     *  naming the process for logging
     * @param string $bundle_path the path to the IndexDocumentBundle or
     *  FeedDocumentBundle we are adding dictionary info for
     */
    public static function run($channel, $bundle_path)
    {
        /* Only initialize file logging if the caller has not already
           chosen a logging mode (for example "terminal").
         */
        if (!isset($_SERVER["LOG_TO_FILES"])) {
            $_SERVER["LOG_TO_FILES"] = true;
            $process_name = $channel . "-DictionaryUpdater";
            L\crawlLog("\n\nInitialize logger..", $process_name, true);
        }
        $archive_type = self::getArchiveKind($bundle_path);
        if (!$archive_type) {
            L\crawlLog("Unknown Archive Type Exiting...");
            exit();
        }
        $archive_name = C\NS_LIB . $archive_type;
        if ($archive_type == "FeedDocumentBundle") {
            /* Feed bundles are constructed with a database connection and
               are given the list of configured media sources.
             */
            $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
            $db = new $dbms_manager();
            $db->connect();
            $index_archive = new $archive_name($bundle_path, $db, false,
                null, C\NUM_DOCS_PER_PARTITION);
            $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss' OR " .
                "TYPE='html' OR TYPE='json' OR TYPE='regex')";
            $result = $db->execute($sql);
            $feeds = [];
            while ($feed = $db->fetchArray($result)) {
                MJ\FeedsUpdateJob::parseFeedAuxInfo($feed);
                $feeds[] = $feed;
            }
            $index_archive->feeds = $feeds;
        } else {
            $index_archive = new $archive_name($bundle_path, false, "",
                C\NUM_DOCS_PER_PARTITION);
        }
        $next_partition_path = $bundle_path . "/" .
            IndexDocumentBundle::NEXT_PARTITION_FILE;
        if (file_exists($next_partition_path)) {
            $index_archive->next_partition_to_add = intval(
                file_get_contents($next_partition_path));
        } else {
            /* No record of progress yet: start from partition 0 and
               create the file holding that number.
             */
            $index_archive->next_partition_to_add = 0;
            file_put_contents($next_partition_path,
                $index_archive->next_partition_to_add);
        }
        // note the false parameter means will only update for one partition
        $index_archive->updateDictionary($next_partition_path, false);
    }
    /**
     * Given a folder name, determines the kind of bundle (if any) it holds.
     * It does this by checking, in order, for the existence of marker
     * files and folders within $archive_path: bundle0, description.txt,
     * filter_a.ftr, summaries/description.txt, archive_info.txt, and
     * finally arc_description.ini (whose arc_type entry is then used).
     *
     * @param string $archive_path the path to archive folder
     * @return mixed the archive bundle type as a string, one of:
     *  DoubleIndexBundleOld, DoubleIndexBundle, WebArchiveBundle,
     *  FeedArchiveBundle, FeedDocumentBundle, IndexArchiveBundle,
     *  IndexDocumentBundle, or the arc_type value read from
     *  arc_description.ini; false if none of the checks succeed
     */
    public static function getArchiveKind($archive_path)
    {
        if (file_exists("$archive_path/bundle0")) {
            if (file_exists(
                "$archive_path/bundle0/summaries/description.txt")) {
                return "DoubleIndexBundleOld";
            } else {
                return "DoubleIndexBundle";
            }
        }
        if (file_exists("$archive_path/description.txt")) {
            return "WebArchiveBundle";
        }
        if (file_exists("$archive_path/filter_a.ftr")) {
            if (file_exists("$archive_path/summaries/description.txt")) {
                return "FeedArchiveBundle";
            } else {
                return "FeedDocumentBundle";
            }
        }
        if (file_exists("$archive_path/summaries/description.txt")) {
            return "IndexArchiveBundle";
        } else if (file_exists("$archive_path/archive_info.txt")) {
            return "IndexDocumentBundle";
        }
        $desc_path = "$archive_path/arc_description.ini";
        if (file_exists($desc_path)) {
            $desc = L\parse_ini_with_fallback($desc_path);
            if (!isset($desc['arc_type'])) {
                return false;
            }
            return $desc['arc_type'];
        }
        return false;
    }
}
/*
 * Runs the DictionaryUpdater when this script is invoked with arguments:
 *  (run|terminal) channel bundle_path
 * In the terminal case, LOG_TO_FILES is set to "terminal" before running.
 */
if (!empty($argv[3]) && in_array($argv[1], ["run", "terminal"], true)) {
    if ($argv[1] == "terminal") {
        $_SERVER["LOG_TO_FILES"] = "terminal";
    }
    DictionaryUpdater::run($argv[2], $argv[3]);
}