<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Parth Patel (modified to better use xpaths Chris Pollett)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\media_jobs;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\models\GroupModel;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\UrlParser;

/**
 * A media job to periodically update descriptions of Wiki resources
 * using Description Search Sources
 */
class DescriptionUpdateJob extends MediaJob
{
    /**
     * Time in current epoch when description last updated
     * @var int
     */
    public $update_time;
    /**
     * Datasource object used to run db queries that look up the
     * description search sources
     * @var object
     */
    public $db;
    /**
     * Lines of NEEDS_DESCRIPTION_FILE still waiting to be processed. Each
     * line has the format page_id:thumb_folder_path. Declared explicitly
     * so that assigning to it is not a dynamic property creation.
     * @var array
     */
    public $page_id_thumb_folder_paths = [];
    /**
     * File to tell DescriptionUpdateJob that a wiki resource needs a
     * description
     */
    const NEEDS_DESCRIPTION_FILE = C\APP_DIR .
        "/resources/needs_descriptions.txt";
    /**
     * Initializes the last update time to far in the past so, description will
     * get immediately updated. Sets up connection to DB to fetch description
     * search sources
     */
    public function init()
    {
        $this->update_time = 0;
        $this->name_server_does_client_tasks = true;
        $this->name_server_does_client_tasks_only = true;
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
        $this->db = new $db_class();
        $this->db->connect();
        C\nsconddefine("DESCRIPTION_UPDATE_INTERVAL", C\ONE_HOUR);
    }
    /**
     * Only update if more than C\DESCRIPTION_UPDATE_INTERVAL seconds have
     * passed since the last update and NEEDS_DESCRIPTION_FILE exists and is
     * non-empty, that is, there are resources requiring a description update
     *
     * @return bool whether a description update should be performed now
     */
    public function checkPrerequisites()
    {
        $time = time();
        $delta = $time - $this->update_time;
        /* This method is called repeatedly from the same process, so make
           sure file_exists/filesize below do not report cached stat info
         */
        clearstatcache(true, self::NEEDS_DESCRIPTION_FILE);
        if ($delta > C\DESCRIPTION_UPDATE_INTERVAL &&
            file_exists(self::NEEDS_DESCRIPTION_FILE) &&
            filesize(self::NEEDS_DESCRIPTION_FILE) > 0) {
            $this->update_time = $time;
            L\crawlLog("---- Performing resources description update ----");
            return true;
        }
        L\crawlLog("---- Time since last update not exceeded, " .
            "skipping description update ----");
        return false;
    }
    /**
     * Get the description search sources from the local database and use
     * those to run the same task as in the distributed setting
     */
    public function nondistributedTasks()
    {
        $db = $this->db;
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE TYPE='description_source'";
        $result = $db->execute($sql);
        $sources = [];
        while ($source = $db->fetchArray($result)) {
            $this->parseDescriptionAuxInfo($source);
            $sources[] = $source;
        }
        $this->tasks = $sources;
        $this->doTasks($sources);
    }
    /**
     * Parses out the components of the auxiliary field of a description
     * source. The AUX_INFO field is entity decoded and split on "###";
     * parts 0-3 and 7 are stored back into $source as AUX_INFO, ITEM_XPATH,
     * TITLE_XPATH, URL_XPATH and TEST_DATA respectively. Missing parts are
     * set to the empty string.
     *
     * @param array &$source associative array of data about one particular
     *      description source; modified in place
     */
    public static function parseDescriptionAuxInfo(&$source)
    {
        $aux_parts = explode("###", html_entity_decode(
            $source['AUX_INFO'] ?? "", ENT_QUOTES));
        // pad so a short AUX_INFO doesn't produce undefined offset warnings
        $aux_parts = array_pad($aux_parts, 8, "");
        list($source['AUX_INFO'], $source['ITEM_XPATH'],
            $source['TITLE_XPATH'], $source['URL_XPATH'], , , ,
            $source['TEST_DATA']) = $aux_parts;
    }
    /**
     * For each resource requiring description update, use the description
     * search sources to find information. Processing stops once C\ONE_HOUR
     * seconds have passed since $this->update_time; the entries not yet
     * completed are then written back to NEEDS_DESCRIPTION_FILE. If every
     * entry is processed, NEEDS_DESCRIPTION_FILE is emptied.
     *
     * @param array $tasks array of description sources
     */
    public function doTasks($tasks)
    {
        $contents = is_readable(self::NEEDS_DESCRIPTION_FILE) ?
            file_get_contents(self::NEEDS_DESCRIPTION_FILE) : false;
        if (!is_array($tasks) || $contents === false) {
            L\crawlLog(
                "---- This media updater is NOT responsible for " .
                "any description update! ----");
            return;
        }
        /* trim handles files saved with either \n or \r\n (PHP_EOL) line
           endings; blank lines carry no work so are dropped
         */
        $lines = array_map('trim', explode("\n", $contents));
        $this->page_id_thumb_folder_paths = array_values(array_unique(
            array_filter($lines, function ($line) {
                return $line !== "";
            })));
        L\crawlLog("---- This media updater is responsible for " .
            "the description updates ----");
        $page_id_thumb_folder_paths = $this->page_id_thumb_folder_paths;
        foreach ($page_id_thumb_folder_paths as $page_id_thumb_folder_path) {
            if ($this->runtimeLimitExceeded()) {
                $this->saveRemainingPaths();
                return;
            }
            $this->updateResourcesDescription($tasks,
                $page_id_thumb_folder_path);
            if ($this->runtimeLimitExceeded()) {
                /* updateResourcesDescription may have stopped part way
                   through this folder, so keep its entry in the pending
                   list. Re-running a folder that was in fact finished is
                   harmless as its needs_description.txt is then empty.
                 */
                $this->saveRemainingPaths();
                return;
            }
            array_shift($this->page_id_thumb_folder_paths);
        }
        file_put_contents(self::NEEDS_DESCRIPTION_FILE, "");
    }
    /**
     * Updates/finds descriptions for resources listed in a
     * needs_description.txt in a wiki pages thumb subfolder.
     * It does this by iterating over all configured description search sources
     * until a match is found. It then saves the description in a file
     * named after the resource with .txt appended in the given thumb folder.
     * In test mode no files are read or written: the resources to look up
     * come from the TEST_DATA field of the first source and log output is
     * collected as an HTML string.
     *
     * @param array $sources associative array containing details of all search
     *      sources
     * @param string $page_id_thumb_folder_path string of the form
     *      page_id:thumb_folder_path where thumb_folder_path is the folder
     *      containing the needs_description.txt file
     * @param bool $test_mode used to return string in test mode
     * @return mixed false if not in test mode and either
     *      $page_id_thumb_folder_path did not have the expected format or
     *      the thumb folder does not exist; otherwise, string of log
     *      messages if $test_mode true, and true if not
     */
    public function updateResourcesDescription($sources,
        $page_id_thumb_folder_path = "", $test_mode = false)
    {
        $page_name = "";
        $thumb_folder_path = "";
        $page_path_parts = explode(":", $page_id_thumb_folder_path, 2);
        if (count($page_path_parts) == 2) {
            list($page_id, $thumb_folder_path) = $page_path_parts;
            $group_model = new GroupModel();
            $page_info = $group_model->getPageInfoByPageId($page_id);
            $page_name = $page_info['PAGE_NAME'] ?? "";
        } else if (!$test_mode) {
            return false;
        }
        if (!$test_mode && !file_exists($thumb_folder_path)) {
            return false;
        }
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        $file_path = $thumb_folder_path . "/needs_description.txt";
        if ($test_mode) {
            $resource_details = explode("\n",
                $sources[0]['TEST_DATA'] ?? "");
        } else {
            $log_function("---- Processing file $file_path ----");
            $resource_details = file_exists($file_path) ?
                (file($file_path) ?: []) : [];
        }
        /* file() keeps line endings, so trim before filtering, otherwise
           blank lines would be treated as resources with an empty name
         */
        $resource_details = array_values(array_filter(
            array_map('trim', $resource_details),
            function ($detail) {
                return $detail !== "";
            }));
        $remaining_details = $resource_details;
        $i = 1;
        foreach ($resource_details as $resource_detail) {
            if (!$test_mode && $this->runtimeLimitExceeded()) {
                // save what is left so a later run can continue from here
                file_put_contents($file_path,
                    implode(PHP_EOL, $remaining_details));
                return true;
            }
            $log_function("Processing $i - $resource_detail", "h3");
            array_shift($remaining_details);
            $resource_name = preg_replace('/\s+/', ' ', $resource_detail);
            $resource_name = pathinfo($resource_name)['filename'];
            $resource_name = preg_replace('/\s+/', '%20', $resource_name);
            $found_details = false;
            foreach ($sources as $source) {
                $source_name = $source['NAME'];
                if (!$this->matchResourceSourcePathTerms(
                    $page_name . "/". $thumb_folder_path . "/" .
                    $resource_detail, $source['CATEGORY'])) {
                    continue;
                }
                /* best match must be tracked per source, otherwise a url
                   found by an earlier source would be fetched again and
                   parsed with this source's xpaths
                 */
                $max_score = 0;
                $details_page_url = "";
                $log_function("*** Using search source <b>$source_name" .
                    "</b> to find description ***", "p");
                $search_page_url = $source['SOURCE_URL'] . $resource_name;
                $log_function(" Search Page URL - $search_page_url", "pre");
                $search_page = FetchUrl::getPage($search_page_url);
                if (empty($search_page)) {
                    $log_function("<span class='red'>No search results ".
                        "found for $resource_name</span>", "p");
                    continue;
                }
                set_error_handler(null);
                $dom = L\getDomFromString($search_page);
                $items = false;
                if (!empty($dom)) {
                    $dom_xpath = new \DOMXPath($dom);
                    $items = @$dom_xpath->evaluate($source['ITEM_XPATH']);
                }
                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
                if (!($items instanceof \DOMNodeList)) {
                    // evaluate gives false or a scalar if xpath unsuitable
                    $log_function("<span class='red'>Could not extract ".
                        "search result items</span>", "p");
                    continue;
                }
                foreach ($items as $item) {
                    $processed_result = $this->processItem($item,
                        $resource_name, $source, $dom, $test_mode);
                    $test_results .= $processed_result[2];
                    if ($processed_result[0] > $max_score) {
                        $max_score = $processed_result[0];
                        $details_page_url = $processed_result[1];
                    }
                }
                if (empty($details_page_url)) {
                    continue;
                }
                $log_function("<b>Selected Details Page URL - " .
                    "$details_page_url</b>", "pre");
                $details_page = FetchUrl::getPage($details_page_url);
                if (empty($details_page)) {
                    $log_function("<span class='red'>Details page".
                        " not available</span>", "p");
                    continue;
                }
                list($details, $test_info) = $this->getDetails(
                    $details_page, $source, $test_mode);
                $test_results .= $test_info;
                if (empty($details)) {
                    continue;
                }
                $found_details = true;
                if ($test_mode) {
                    $log_function("*** Found below details ***", "p");
                    $log_function("$details", "pre");
                } else {
                    file_put_contents($thumb_folder_path .
                        "/$resource_detail.txt", $details);
                }
                break;
            }
            if (!$found_details && !$test_mode) {
                file_put_contents($thumb_folder_path .
                    "/$resource_detail.txt", "Description search sources".
                    " failed to find description.");
            }
            $i++;
        }
        if (!$test_mode) {
            file_put_contents($file_path, "");
        }
        return $test_mode ? $test_results : true;
    }
    /**
     * Checks if the terms wiki page name followed by a path to a wiki resource
     * contain the terms in a description search source string which would
     * trigger that search source to get used. Underscores are treated as
     * spaces and the comparison is done in lower case.
     *
     * @param string $page_name_resource_path path to check terms against
     *      search source trigger terms. Each "/" separated component is a
     *      term. The mimetype of the resource, as well as its major
     *      type (the portion before the "/"), are also added to the
     *      list of terms to check
     * @param string $source_term_string a comma separated list of terms
     *      used by a description source to see if it can supply a description
     *      of the given resource.
     * @return bool whether the path contained any of the source trigger terms
     */
    public function matchResourceSourcePathTerms($page_name_resource_path,
        $source_term_string)
    {
        $page_name_resource_path = preg_replace("/\_/u", " ",
            $page_name_resource_path);
        $source_term_string = preg_replace("/\_/u", " ",
            $source_term_string);
        $mime_type = L\mimeType($page_name_resource_path, true);
        $mime_type_major = explode("/", $mime_type)[0];
        $check_parts = explode("/", mb_strtolower($page_name_resource_path));
        $check_parts[] = $mime_type;
        $check_parts[] = $mime_type_major;
        $source_terms = preg_split("/\s*,\s*/u", trim(mb_strtolower(
            $source_term_string)));
        foreach ($check_parts as $check_part) {
            if (in_array($check_part, $source_terms)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Processes $item, a DOMElement representing a search result for
     * a description for the wiki resource $name, extracting a title and
     * url. From the title a match score with $name is obtained. This score
     * and url as well as in test mode log messages are returned.
     *
     * @param $item DOMNode representing one possible description search result
     * @param $name the wiki resource name we are trying to get a description
     *  of. Occurrences of %20 are treated as spaces when scoring.
     * @param $source the source associative array with information about how
     *  to extract description from the current dom document and dom node.
     * @param $dom DOMDocument of whole document node is from, used in
     *  creating DOMXpath object for querying $item.
     * @param bool $test_mode whether log messages should be collected as
     *  HTML in the returned array rather than sent to the crawl log
     * @return array $score, $url, $test_results $score of $item as a likely
     *  source for a description for the wiki resource $name, $url
     *  that $item point to with more information, $test_results log messages
     *  if in test mode. If no title or no url could be extracted
     *  $score is 0 and $url is null.
     */
    public function processItem($item, $name, $source, $dom,
        $test_mode = false)
    {
        if (!($item instanceof \DOMNode) || !$item->hasChildNodes()) {
            return [0, null, ""];
        }
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        $log_function("*** Processing item ***", "p");
        $dom_xpath = new \DOMXPath($dom);
        $score = 0;
        $title = trim(mb_strtolower($this->firstXpathText($dom_xpath,
            $source['TITLE_XPATH'] ?? "", $item)));
        if ($title !== "") {
            /* the title was lower cased, and the caller encodes spaces
               in $name as %20, so normalise $name the same way or the
               similarity would be needlessly reduced
             */
            $compare_name = mb_strtolower(str_replace("%20", " ", $name));
            similar_text($compare_name, $title, $score);
        }
        $url = $this->firstXpathText($dom_xpath,
            $source['URL_XPATH'] ?? "", $item);
        if ($url !== "") {
            $url_parts = parse_url($source['SOURCE_URL']);
            $base_url = ($url_parts['scheme'] ?? "") . "://" .
                ($url_parts['host'] ?? "");
            $url = UrlParser::canonicalLink($url, $base_url);
        }
        if (!empty($title) && !empty($url)) {
            $log_function(" <b>Title:</b> $title", "pre");
            $log_function(" <b>URL:</b> $url", "pre");
            $log_function(" <b>Title Match Percentage:</b> $score", "pre");
            return [$score, $url, $test_results];
        }
        return [0, null, $test_results];
    }
    /**
     * Fetches the details on the url page using the xpaths
     * values configured in search source. Each line of the source's
     * AUX_INFO field is split on "|" into a detail name and an xpath. The
     * text found by each xpath is output under a heading = detail name =.
     *
     * @param $page string the html string of the details page
     * @param $source array search source details
     * @param bool $test_mode whether log messages should be collected as
     *  HTML in the returned array rather than sent to the crawl log
     * @return array [$details, $test_results] where $details is the
     *  word-wrapped string of details found using xpaths (empty if none
     *  found or the page could not be parsed) and $test_results are the log
     *  messages if in test mode
     */
    public function getDetails($page, $source, $test_mode = false)
    {
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        set_error_handler(null);
        $dom = L\getDomFromString($page);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        if (empty($dom)) {
            $log_function("<span class='red'>Error creating DOM</span>",
                "pre");
            // callers destructure the result so must still return a pair
            return ["", $test_results];
        }
        $dom_xpath = new \DOMXPath($dom);
        $details = "";
        foreach (explode("\n", $source['AUX_INFO']) as $detail_item) {
            $detail_parts = explode("|", $detail_item);
            $detail_name = trim($detail_parts[0]);
            $detail_xpath = trim($detail_parts[1] ?? "");
            if ($detail_name === "" && $detail_xpath === "") {
                continue; // blank line in AUX_INFO
            }
            $detail_nodes = ($detail_xpath === "") ? false :
                @$dom_xpath->evaluate($detail_xpath);
            $sub_details = "";
            if (is_string($detail_nodes) || is_int($detail_nodes) ||
                is_float($detail_nodes)) {
                // xpath functions such as string() give a scalar result
                $entry_text = trim(preg_replace('/\s+/', ' ',
                    (string)$detail_nodes));
                if ($entry_text !== "") {
                    $sub_details = $entry_text . "\n";
                }
            } else if ($detail_nodes instanceof \DOMNodeList) {
                foreach ($detail_nodes as $detail_node) {
                    $entry_text = trim(preg_replace('/\s+/', ' ',
                        $detail_node->textContent ?? ""));
                    if ($entry_text === "") {
                        continue;
                    }
                    $entry_text .= "\n";
                    // skip text already captured from an earlier node
                    if (!str_contains($sub_details, $entry_text)) {
                        $sub_details .= $entry_text;
                    }
                }
            }
            if ($sub_details === "") {
                $log_function("<span class='red'>Could not fetch value".
                    " for <b>$detail_name</b></span>", "pre");
            } else {
                $details .= "= $detail_name =\n" . $sub_details . "\n";
            }
        }
        return [wordwrap($details), $test_results];
    }
    /**
     * Creates the logging callback shared by the methods of this class.
     * In test mode the callback appends the message, wrapped in the given
     * HTML tag, to $test_results; otherwise, it sends the message to
     * L\crawlLog.
     *
     * @param string &$test_results string that test mode messages are
     *  appended to
     * @param bool $test_mode whether to collect messages rather than log
     * @return callable function ($msg, $log_tag) where $log_tag is a tag
     *  name optionally followed by attributes
     */
    private function makeLogFunction(&$test_results, $test_mode)
    {
        return function ($msg, $log_tag = "div class='source-test'")
            use (&$test_results, $test_mode) {
            // closing tag is the tag name without any attributes
            $close_tag = preg_split("/\s+/", $log_tag)[0];
            if ($test_mode) {
                $test_results .= "<$log_tag style='overflow-x: scroll;'>" .
                    "$msg</$close_tag>\n";
            } else {
                L\crawlLog($msg);
            }
        };
    }
    /**
     * Evaluates an xpath relative to a context node and returns the text
     * of the first result. An xpath starting with / is prefixed with . so
     * it is evaluated relative to the context node.
     *
     * @param object $dom_xpath DOMXPath object to evaluate with
     * @param string $xpath expression to evaluate
     * @param object $context DOMNode the expression is relative to
     * @return string text content of the first node matched, or the result
     *  itself if the expression evaluated to a string; empty string
     *  otherwise
     */
    private function firstXpathText($dom_xpath, $xpath, $context)
    {
        $xpath = (string)$xpath;
        if ($xpath === "") {
            return "";
        }
        if (str_starts_with($xpath, "/")) {
            $xpath = "." . $xpath;
        }
        $result = @$dom_xpath->evaluate($xpath, $context);
        if (is_string($result)) {
            return $result;
        }
        if ($result instanceof \DOMNodeList && $result->length > 0) {
            return $result->item(0)->textContent ?? "";
        }
        return "";
    }
    /**
     * Checks whether C\ONE_HOUR seconds or more have passed since
     * $this->update_time
     *
     * @return bool true if the run time limit has been reached
     */
    private function runtimeLimitExceeded()
    {
        return time() - $this->update_time >= C\ONE_HOUR;
    }
    /**
     * Logs that the run time limit was hit and writes the entries still in
     * $this->page_id_thumb_folder_paths back to NEEDS_DESCRIPTION_FILE
     */
    private function saveRemainingPaths()
    {
        L\crawlLog("---- Runtime limit exceeded, saving the current " .
            "state and yielding the processor ----");
        file_put_contents(self::NEEDS_DESCRIPTION_FILE,
            implode(PHP_EOL, $this->page_id_thumb_folder_paths));
    }
}