Last commit for src/library/media_jobs/DescriptionUpdateJob.php: c342934e7c723fccbc56a8d95d29d20fee50df46

Improve DescriptionUpdateJob so path matching uses regex, update example search sources and wiki

Chris Pollett [2024-04-13 05:Apr:th]

Improve DescriptionUpdateJob so path matching uses regex, update example search sources and wiki

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Parth Patel (modified to better use xpaths Chris Pollett)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\media_jobs;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\models\GroupModel;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\UrlParser;

/**
 * A media job to periodically update descriptions of Wiki resources
 * using Description Search Sources
 */
class DescriptionUpdateJob extends MediaJob
{
    /**
     * Time in current epoch when description last updated
     * @var int
     */
    public $update_time;
    /**
     * Datasource object used to query the description search sources
     * stored in the MEDIA_SOURCE table
     * @var object
     */
    public $db;
    /**
     * Lines of NEEDS_DESCRIPTION_FILE that doTasks still has to process.
     * Each line has the format page_id:thumb_folder_path
     * @var array
     */
    public $page_id_thumb_folder_paths = [];
    /**
     * File to tell DescriptionUpdateJob that a wiki resource needs a
     * description
     */
    const NEEDS_DESCRIPTION_FILE = C\APP_DIR .
        "/resources/needs_descriptions.txt";
    /**
     * Initializes the last update time to far in the past so, description will
     * get immediately updated. Sets up connection to DB to fetch description
     * search sources. Also defines DESCRIPTION_UPDATE_INTERVAL to be one hour
     * if it has not already been defined.
     */
    public function init()
    {
        $this->update_time = 0;
        $this->name_server_does_client_tasks = true;
        $this->name_server_does_client_tasks_only = true;
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
        $this->db = new $db_class();
        $this->db->connect();
        C\nsconddefine("DESCRIPTION_UPDATE_INTERVAL", C\ONE_HOUR);
    }
    /**
     * Only update if more than DESCRIPTION_UPDATE_INTERVAL seconds have
     * passed since the last update and NEEDS_DESCRIPTION_FILE exists and
     * is non-empty
     *
     * @return bool whether a description update should be performed now
     */
    public function checkPrerequisites()
    {
        $time = time();
        if ($time - $this->update_time <= C\DESCRIPTION_UPDATE_INTERVAL) {
            L\crawlLog("---- Time since last update not exceeded, " .
                "skipping description update ----");
            return false;
        }
        /* this job lives in a long running process, so make sure the
           size we look at is not a stale cached stat result
         */
        clearstatcache(true, self::NEEDS_DESCRIPTION_FILE);
        if (!file_exists(self::NEEDS_DESCRIPTION_FILE) ||
            !(filesize(self::NEEDS_DESCRIPTION_FILE) > 0)) {
            L\crawlLog("---- No resources need descriptions, " .
                "skipping description update ----");
            return false;
        }
        $this->update_time = $time;
        L\crawlLog("---- Performing resources description update ----");
        return true;
    }
    /**
     * Get the description search sources from the local database and use
     * those to run the same task as in the distributed setting
     */
    public function nondistributedTasks()
    {
        $db = $this->db;
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE TYPE='description_source'";
        $result = $db->execute($sql);
        $sources = [];
        while ($source = $db->fetchArray($result)) {
            $this->parseDescriptionAuxInfo($source);
            $sources[] = $source;
        }
        $this->tasks = $sources;
        $this->doTasks($sources);
    }
    /**
     * Parses out the components of the auxiliary field of a description
     * source. The field is a ###-separated list; after this call AUX_INFO
     * holds only the first component and ITEM_XPATH, TITLE_XPATH, URL_XPATH
     * and TEST_DATA hold components 2, 3, 4 and 8. Components that are
     * missing are set to the empty string.
     *
     * @param array &$source associative array of data about one particular
     *  description source
     */
    public static function parseDescriptionAuxInfo(&$source)
    {
        // pad so that a short AUX_INFO does not cause undefined offsets
        $aux_parts = array_pad(explode("###", html_entity_decode(
            $source['AUX_INFO'] ?? "", ENT_QUOTES)), 8, "");
        // components 5-7 are not used by this job
        list($source['AUX_INFO'], $source['ITEM_XPATH'],
            $source['TITLE_XPATH'], $source['URL_XPATH'],
            , , , $source['TEST_DATA']) = $aux_parts;
    }
    /**
     * For each resource requiring description update, use the description
     * search sources to find information. If the one hour runtime limit is
     * reached the folders not yet completely handled are written back to
     * NEEDS_DESCRIPTION_FILE, otherwise that file is emptied.
     *
     * @param array $tasks array of description sources
     */
    public function doTasks($tasks)
    {
        $queue_file = self::NEEDS_DESCRIPTION_FILE;
        clearstatcache(true, $queue_file);
        $queue_contents = is_readable($queue_file) ?
            (string)file_get_contents($queue_file) : "";
        /* the queue is written using PHP_EOL so accept any line ending,
           drop blank lines and duplicates
         */
        $this->page_id_thumb_folder_paths = array_values(array_unique(
            array_filter(preg_split('/\r\n|\r|\n/', $queue_contents),
            function ($line) {
                return trim($line) !== "";
            })));
        if (!is_array($tasks)) {
            L\crawlLog(
                "---- This media updater is NOT responsible for " .
                "any description update! ----");
            return;
        }
        L\crawlLog("---- This media updater is responsible for " .
            "the description updates ----");
        $pending_paths = $this->page_id_thumb_folder_paths;
        foreach ($pending_paths as $page_id_thumb_folder_path) {
            if (!$this->runtimeLimitExceeded()) {
                $this->updateResourcesDescription($tasks,
                    $page_id_thumb_folder_path);
            }
            /* updateResourcesDescription may itself have stopped part way
               through a folder because of the runtime limit. So the current
               folder is only removed from the queue if there is still time
               left. Re-running a completed folder later is harmless as its
               needs_description.txt will be empty.
             */
            if ($this->runtimeLimitExceeded()) {
                L\crawlLog("---- Runtime limit exceeded, saving the current " .
                    "state and yielding the processor ----");
                file_put_contents($queue_file, implode(PHP_EOL,
                    $this->page_id_thumb_folder_paths));
                return;
            }
            array_shift($this->page_id_thumb_folder_paths);
        }
        file_put_contents($queue_file, "");
    }
    /**
     * Updates/finds descriptions for resources listed in a
     * needs_description.txt in a wiki pages thumb subfolder.
     * It does this by iterating over all configured description search sources
     * until a match is found. It then saves the description in a file
     * resource_name.txt in the given resource thumb folder. In test mode
     * nothing is written, the resources to look up come from the TEST_DATA
     * of the first source, and log messages are collected as HTML.
     *
     * @param array $sources associative array containing details of all search
     *      sources
     * @param string $page_id_thumb_folder_path string of the form
     *      page_id:thumb_folder_path where thumb_folder_path is the folder
     *      containing the needs_description.txt file
     * @param boolean $test_mode used to return string in test mode
     * @return mixed in test mode the string of test log messages; otherwise,
     *      false if the path was malformed or the folder does not exist, and
     *      true if the folder was processed (possibly only partially because
     *      the runtime limit was reached)
     */
    public function updateResourcesDescription($sources,
        $page_id_thumb_folder_path = "", $test_mode = false)
    {
        $page_name = "";
        $thumb_folder_path = "";
        $page_path_parts = explode(":", $page_id_thumb_folder_path, 2);
        if (count($page_path_parts) == 2) {
            list($page_id, $thumb_folder_path) = $page_path_parts;
            $group_model = new GroupModel();
            $page_info = $group_model->getPageInfoByPageId($page_id);
            $page_name = $page_info['PAGE_NAME'] ?? "";
        } else if (!$test_mode) {
            return false;
        }
        if (!$test_mode && !file_exists($thumb_folder_path)) {
            return false;
        }
        if (!is_array($sources)) {
            $sources = [];
        }
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        $file_path = $thumb_folder_path . "/needs_description.txt";
        if ($test_mode) {
            $raw_details = preg_split('/\r\n|\r|\n/',
                (string)($sources[0]['TEST_DATA'] ?? ""));
        } else {
            $log_function("---- Processing file $file_path ----");
            $raw_details = file_exists($file_path) ?
                (file($file_path, FILE_IGNORE_NEW_LINES) ?: []) : [];
        }
        /* trim first and compare against "" so that blank lines are dropped
           but a resource called "0" is kept
         */
        $resource_details = array_values(array_filter(
            array_map('trim', $raw_details),
            function ($detail) {
                return $detail !== "";
            }));
        $remaining_details = $resource_details;
        $i = 1;
        foreach ($resource_details as $resource_detail) {
            if (!$test_mode && $this->runtimeLimitExceeded()) {
                // save resources not yet handled so a later run continues
                file_put_contents($file_path, implode(PHP_EOL,
                    $remaining_details));
                return true;
            }
            $log_function("Processing $i - $resource_detail", "h3");
            array_shift($remaining_details);
            $resource_name = preg_replace('/\s+/', ' ', $resource_detail);
            $resource_name = pathinfo($resource_name, PATHINFO_FILENAME);
            $resource_name = preg_replace('/\s+/', '%20', $resource_name);
            $found_details = false;
            foreach ($sources as $source) {
                if (!$this->matchResourceSourcePathTerms(
                    $page_name . "/". $thumb_folder_path . "/" .
                    $resource_detail, $source['CATEGORY'])) {
                    continue;
                }
                $source_name = $source['NAME'];
                $log_function("*** Using search source <b>$source_name" .
                    "</b> to find description ***", "p");
                $search_page_url = $source['SOURCE_URL'] . $resource_name;
                $log_function(" Search Page URL - $search_page_url", "pre");
                $search_page = FetchUrl::getPage($search_page_url);
                if (empty($search_page)) {
                    $log_function("<span class='red'>No search results ".
                        "found for $resource_name</span>", "p");
                    continue;
                }
                set_error_handler(null);
                $dom = L\getDomFromString($search_page);
                $items = false;
                if (!empty($dom)) {
                    $dom_xpath = new \DOMXPath($dom);
                    $items = @$dom_xpath->evaluate(
                        (string)($source['ITEM_XPATH'] ?? ""));
                }
                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
                if (empty($dom)) {
                    $log_function("<span class='red'>Error creating DOM".
                        "</span>", "pre");
                    continue;
                }
                if (!($items instanceof \DOMNodeList)) {
                    // invalid xpath (false) or xpath with a scalar result
                    $log_function("<span class='red'>Item XPath did not ".
                        "select any nodes</span>", "pre");
                    continue;
                }
                /* the best url must come from the current source as the
                   details page is parsed using this source's xpaths
                 */
                $max_score = 0;
                $details_page_url = "";
                foreach ($items as $item) {
                    list($score, $url, $item_log) = $this->processItem($item,
                        $resource_name, $source, $dom, $test_mode);
                    $test_results .= $item_log;
                    if ($score > $max_score) {
                        $max_score = $score;
                        $details_page_url = $url;
                    }
                }
                if (empty($details_page_url)) {
                    continue;
                }
                $log_function("<b>Selected Details Page URL - " .
                    "$details_page_url</b>", "pre");
                $details_page = FetchUrl::getPage($details_page_url);
                if (empty($details_page)) {
                    $log_function("<span class='red'>Details page".
                        " not available</span>", "p");
                    continue;
                }
                list($details, $test_info) = $this->getDetails(
                    $details_page, $source, $test_mode);
                $test_results .= $test_info;
                if (empty($details)) {
                    continue;
                }
                $found_details = true;
                if ($test_mode) {
                    $log_function("*** Found below details ***", "p");
                    $log_function("$details", "pre");
                } else {
                    file_put_contents($thumb_folder_path .
                        "/$resource_detail.txt", $details);
                }
                break;
            }
            if (!$found_details && !$test_mode) {
                file_put_contents($thumb_folder_path .
                    "/$resource_detail.txt", "Description search sources".
                        " failed to find description.");
            }
            $i++;
        }
        if (!$test_mode) {
            file_put_contents($file_path, "");
        }
        return $test_mode ? $test_results : true;
    }
    /**
     * Checks if the terms wiki page name followed by a path to a wiki resource
     * contain the terms in a description search source string which would
     * trigger that search source to get used. Underscores are treated as
     * spaces and the comparison is done on lower cased, /-separated path
     * components. The mimetype of the resource and its major type are also
     * added to the list of components to check
     *
     * @param string $page_name_resource_path path to check terms against
     *  search source trigger terms
     * @param string $source_term_string a comma separated list of terms
     *  used by a description source to see if it can supply a description of
     *  the given resource.
     * @return bool whether the path contained any of the source trigger terms
     */
    public function matchResourceSourcePathTerms($page_name_resource_path,
        $source_term_string)
    {
        /* str_replace rather than a /u regex so that a path which is not
           valid UTF-8 is not turned into null
         */
        $page_name_resource_path = str_replace("_", " ",
            (string)$page_name_resource_path);
        $source_term_string = str_replace("_", " ",
            (string)$source_term_string);
        $mime_type = L\mimeType($page_name_resource_path, true);
        $mime_type_major = explode("/", $mime_type)[0];
        $check_parts = explode("/", mb_strtolower($page_name_resource_path));
        $check_parts[] = $mime_type;
        $check_parts[] = $mime_type_major;
        /* NOTE(review): an empty term (empty category or a trailing comma)
           matches any empty path component, for example the one produced by
           an empty page name. Left as is since existing sources may rely on
           it -- confirm.
         */
        $source_terms = preg_split("/\s*,\s*/u", trim(mb_strtolower(
            $source_term_string))) ?: [];
        foreach ($check_parts as $check_part) {
            if (in_array($check_part, $source_terms, true)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Processes $item, a DOMElement representing a search result for
     * a description for the wiki resource $name, extracting a title and
     * url. From the title a match score with $name is obtained. This score
     * and url as well as in test mode log messages are returned.
     *
     * @param $item DOMNode representing one possible description search result
     * @param $name the wiki resource name we are trying to get a description
     *  of. Occurrences of %20 are treated as spaces when scoring.
     * @param $source the source associative array with information about how
     *  to extract description from the current dom document and dom  node.
     * @param $dom DOMDocument of whole document node is from, used in
     *  creating DOMXpath object for quering $item.
     * @param boolean $test_mode whether to collect log messages as HTML
     *  rather than write them to the crawl log
     * @return array $score, $url, $test_results $score of $item as a likely
     *  source for a description for the wiki resource $name, $url
     *  that $item point to with more information, $test_results log messages
     *  if in test mode. If no title or no url could be extracted $score is 0
     *  and $url is null.
     */
    public function processItem($item, $name, $source, $dom, $test_mode = false)
    {
        if (!$item->hasChildNodes()) {
            return [0, null, ""];
        }
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        $log_function("*** Processing item ***", "p");
        $dom_xpath = new \DOMXPath($dom);
        $title = "";
        $score = 0;
        $raw_title = $this->xpathFirstText($dom_xpath,
            $source['TITLE_XPATH'] ?? "", $item);
        if ($raw_title !== null) {
            $title = mb_strtolower($raw_title);
            /* the title was lower cased, so put the name in the same form
               (callers pass it with whitespace encoded as %20) before
               computing the similarity
             */
            $compare_name = mb_strtolower(str_replace("%20", " ",
                (string)$name));
            similar_text($compare_name, $title, $score);
        }
        $url = null;
        $raw_url = $this->xpathFirstText($dom_xpath,
            $source['URL_XPATH'] ?? "", $item);
        if ($raw_url !== null) {
            $url_parts = parse_url($source['SOURCE_URL']) ?: [];
            $base_url = ($url_parts['scheme'] ?? "") . "://" .
                ($url_parts['host'] ?? "");
            $url = UrlParser::canonicalLink($raw_url, $base_url);
        }
        if (!empty($title) && !empty($url)) {
            $log_function(" <b>Title:</b> $title", "pre");
            $log_function(" <b>URL:</b> $url", "pre");
            $log_function(" <b>Title Match Percentage:</b> $score", "pre");
            return [$score, $url, $test_results];
        }
        return [0, null, $test_results];
    }
    /**
     * Fetches the details on the url page using the xpaths
     * values configured in search source. Each line of the source's AUX_INFO
     * is expected to have the format name|xpath; for each line whose xpath
     * yields text a section "= name =" followed by that text is added to
     * the details.
     *
     * @param $page string the html string of the details page
     * @param $source array search source details
     * @param boolean $test_mode whether to collect log messages as HTML
     *  rather than write them to the crawl log
     * @return array [$details, $test_results] where $details is the word
     *  wrapped string of details found using xpaths (empty if none or if the
     *  page could not be parsed) and $test_results holds the log messages
     *  if in test mode
     */
    public function getDetails($page, $source, $test_mode = false)
    {
        $test_results = "";
        $log_function = $this->makeLogFunction($test_results, $test_mode);
        set_error_handler(null);
        $dom = L\getDomFromString($page);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        if (empty($dom)) {
            $log_function("<span class='red'>Error creating DOM</span>",
                "pre");
            // same shape as the normal return, callers destructure it
            return ["", $test_results];
        }
        $dom_xpath = new \DOMXPath($dom);
        $details = "";
        $detail_items = preg_split('/\r\n|\r|\n/',
            (string)($source['AUX_INFO'] ?? ""));
        foreach ($detail_items as $detail_item) {
            if (trim($detail_item) === "") {
                continue;
            }
            // any component after the second is ignored
            list($detail_name, $detail_xpath) = array_pad(
                explode("|", $detail_item), 2, "");
            $detail_name = trim($detail_name);
            $sub_details = "";
            $detail_nodes = (trim($detail_xpath) === "") ? false :
                @$dom_xpath->evaluate($detail_xpath);
            if ($detail_nodes instanceof \DOMNodeList) {
                foreach ($detail_nodes as $detail_node) {
                    if (empty($detail_node->textContent)) {
                        continue;
                    }
                    $entry_text = trim(preg_replace('/\s+/', ' ',
                        $detail_node->textContent)) . "\n";
                    // skip text already present in this section
                    if (!str_contains($sub_details, $entry_text)) {
                        $sub_details .= $entry_text;
                    }
                }
            } else if (!empty($detail_nodes) && (is_string($detail_nodes) ||
                is_int($detail_nodes) || is_float($detail_nodes))) {
                // xpath functions such as string() give a scalar result
                $sub_details .= trim(preg_replace('/\s+/', ' ',
                    (string)$detail_nodes)) . "\n";
            }
            if ($sub_details === "") {
                $log_function("<span class='red'>Could not fetch value".
                    " for <b>$detail_name</b></span>", "pre");
            } else {
                $details .= "= $detail_name =\n" . $sub_details . "\n";
            }
        }
        return [wordwrap($details), $test_results];
    }
    /**
     * Creates the logging callback used by the methods of this class. In
     * test mode the callback appends the message wrapped in the given HTML
     * tag to $test_results, otherwise it writes the message to the crawl log.
     *
     * @param string &$test_results buffer that receives the HTML log
     *  messages in test mode
     * @param boolean $test_mode whether to log to $test_results rather than
     *  the crawl log
     * @return callable function ($msg, $log_tag) where $log_tag is a tag
     *  name optionally followed by attributes
     */
    private function makeLogFunction(&$test_results, $test_mode)
    {
        return function ($msg, $log_tag = "div class='source-test'")
            use (&$test_results, $test_mode) {
            if ($test_mode) {
                // the close tag is the tag name without its attributes
                $close_tag = preg_split("/\s+/", $log_tag)[0];
                $test_results .=
                    "<$log_tag style='overflow-x: scroll;'>$msg</$close_tag>\n";
            } else {
                L\crawlLog($msg);
            }
        };
    }
    /**
     * Checks whether an hour or more has passed since the current update
     * started
     *
     * @return bool true if this job should stop and save its state
     */
    private function runtimeLimitExceeded()
    {
        return time() - $this->update_time >= C\ONE_HOUR;
    }
    /**
     * Evaluates an xpath relative to a context node and returns the trimmed
     * text of the first result. Expressions beginning with / are prefixed
     * with . so that they are evaluated relative to the context node. Both
     * node list results and scalar results (for example from string()) are
     * handled.
     *
     * @param object $dom_xpath DOMXPath for the document of $context_node
     * @param string $expression xpath to evaluate
     * @param object $context_node DOMNode the xpath is relative to
     * @return mixed trimmed text of the first result, or null if the
     *  expression is empty, invalid, or selected nothing
     */
    private function xpathFirstText($dom_xpath, $expression, $context_node)
    {
        $expression = (string)$expression;
        if ($expression === "") {
            return null;
        }
        if ($expression[0] == "/") {
            $expression = "." . $expression;
        }
        $result = @$dom_xpath->evaluate($expression, $context_node);
        if ($result instanceof \DOMNodeList) {
            $first_node = $result->item(0);
            return ($first_node === null) ? null :
                trim((string)$first_node->textContent);
        }
        if (is_string($result) || is_int($result) || is_float($result)) {
            $text = trim((string)$result);
            return ($text === "") ? null : $text;
        }
        return null;
    }
}

ViewGit