Last commit for src/library/ScraperManager.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;

/**
 * Class used by html processors to detect if a page matches a particular
 * signature such as that of a content management system, and
 * also to provide scraping mechanisms for the content of such a page
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu); updated by
 *      Chris Pollett to support scraper priorities and extract fields
 */
class ScraperManager
{
    /**
     * Method used to check a page against a supplied list of scrapers
     * for a matching signature. Among the scrapers whose signature matches,
     * the one with the largest PRIORITY is returned (the earliest such
     * scraper wins ties). A scraper without a PRIORITY is treated as having
     * priority 0.
     *
     * @param string $page the html page to check
     * @param array $scrapers an array of scrapers to check against
     * @return array an associative array of scraper properties (scalar
     *      values are returned as html entity decoded strings) if a
     *      matching scraper signature found; otherwise, the empty array
     */
    public static function getScraper($page, $scrapers)
    {
        $out_scraper = [];
        $out_priority = -1;
        if (empty($scrapers)) {
            return $out_scraper;
        }
        foreach ($scrapers as $scraper) {
            if (empty($scraper)) {
                continue;
            }
            $priority = $scraper['PRIORITY'] ?? 0;
            $signature = html_entity_decode(
                (string) ($scraper['SIGNATURE'] ?? ""), ENT_QUOTES);
            /* compare priorities first: that test is cheap, whereas
               checking a signature may require parsing the whole page
             */
            if ($priority > $out_priority &&
                self::checkSignature($page, $signature)) {
                $out_scraper = $scraper;
                $out_priority = $priority;
            }
        }
        foreach ($out_scraper as $key => $value) {
            // the cast avoids passing null/ints to html_entity_decode
            if ($value === null || is_scalar($value)) {
                $out_scraper[$key] = html_entity_decode((string) $value,
                    ENT_QUOTES);
            }
        }
        return $out_scraper;
    }
    /**
     * Applies scrape rules to a given page. A scrape rule consists of
     * TEXT_PATH xpath for the main content of a web page, a sequence of
     * \n separated DELETE_PATHS for what should be removed from the
     * main content as irrelevant, and finally a list EXTRACT_FIELDS
     * of additional summary fields which should be extracted from the
     * page content.
     *
     * Each EXTRACT_FIELDS line has the form FIELD := literal value, or
     * FIELD = query where query is either an xpath (the text content of all
     * matching nodes is concatenated) or a regex written as r/.../ or
     * regex/.../ which is matched against the whole page.
     *
     * @param string $page the web page to operate on
     * @param array $scraper a scraper object to apply the rules of
     * @return array a pair [$summary, $out_page]. $summary holds the
     *  extracted fields together with a CrawlConstants::SCRAPER_INFO entry
     *  saying whether TEXT_PATH matched and which delete paths removed
     *  something. $out_page is the result of extracting the first
     *  TEXT_PATH match and deleting from it according to DELETE_PATHS, or
     *  the original $page if that result is empty
     */
    public static function applyScraperRules($page, $scraper)
    {
        $delete_paths = explode("\n", $scraper["DELETE_PATHS"] ?? "");
        $dom = self::getContentByXquery($page, $scraper["TEXT_PATH"] ?? "");
        $summary = [];
        $scraper_apply_info = [];
        $out_page = "";
        if (!empty($dom)) {
            $scraper_apply_info["GET_CONTENT_SUCCEEDED"] = true;
            foreach ($delete_paths as $tag_to_remove) {
                if (self::removeContentByXquery($dom, $tag_to_remove)) {
                    $scraper_apply_info["TRIGGERED_REMOVALS"][] =
                        $tag_to_remove;
                }
            }
            $out_page = utf8SafeSaveHtml($dom);
        } else {
            $scraper_apply_info["GET_CONTENT_SUCCEEDED"] = false;
        }
        $summary[L\CrawlConstants::SCRAPER_INFO] = $scraper_apply_info;
        // parsing real world html is noisy, so the handler is switched off
        set_error_handler(null);
        $extract_fields = explode("\n", $scraper["EXTRACT_FIELDS"] ?? "");
        $dom = new \DOMDocument();
        if (!empty($page) && @$dom->loadHTML($page)) {
            // one xpath evaluator serves every extract field of the page
            $xpath = new \DOMXPath($dom);
            foreach ($extract_fields as $extract_field) {
                if (!preg_match('/^([^\=\:]+)(\:?\=)(.+)$/', $extract_field,
                    $parts)) {
                    continue;
                }
                list(, $summary_field, $assign_type, $value_or_query) =
                    $parts;
                $summary_field = trim($summary_field);
                $value_or_query = trim($value_or_query);
                // allow CrawlConstants names to be used as field names
                if (defined(C\NS_LIB . "CrawlConstants::" .$summary_field)){
                    $summary_field = constant(
                        C\NS_LIB . "CrawlConstants::" .$summary_field);
                }
                $out_text = "";
                if ($assign_type == ":=") {
                    $out_text = $value_or_query;
                } else if (preg_match("@^(r(?:egex)?)/@", $value_or_query,
                    $field_matches)) {
                    // drop the r or regex prefix, leaving /.../flags
                    $field_regex = substr($value_or_query,
                        strlen($field_matches[1]));
                    /* NOTE(review): this records the preg_match count
                       ("1" or "0"), not the matched text -- confirm that
                       is what consumers of the summary field expect
                     */
                    $out_text = (string) preg_match($field_regex, $page);
                } else {
                    $results = @$xpath->query($value_or_query);
                    if ($results instanceof \DOMNodeList) {
                        foreach ($results as $node) {
                            $out_text .= $node->textContent;
                        }
                    }
                }
                $summary[$summary_field] = trim($out_text);
            }
        }
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        $out_page = empty($out_page) ? $page : $out_page;
        return [$summary, $out_page];
    }
    /**
     * If $signature begins with '/', checks to see if applying
     * the xpath in $signature to $page results
     * in a non-empty dom node list. Otherwise, does a match of the
     * regex (without matching start and end delimiters (say, /))
     * against $page and returns whether found. An empty signature
     * never matches.
     *
     * @param string $page a web document to check
     * @param string $signature an xpath (if it begins with /) or otherwise
     *      a delimiter-less regex to check against
     * @return boolean true if the xpath returns a non empty dom node list
     *      or the regex matches; false otherwise
     */
    public static function checkSignature($page, $signature)
    {
        $signature = (string) $signature;
        if ($signature === "") {
            /* reading $signature[0] would raise a warning and mb_ereg
               rejects empty patterns, so bail out early
             */
            return false;
        }
        if ($signature[0] != '/') {
            return (mb_ereg($signature, (string) $page) !== false);
        }
        $dom = new \DOMDocument();
        $results = false;
        set_error_handler(null);
        if (!empty($page) && @$dom->loadHTML($page)) {
            $xpath = new \DOMXPath($dom);
            // query gives false, not a node list, on a malformed xpath
            $results = @$xpath->query($signature);
        }
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        return $results instanceof \DOMNodeList && $results->length > 0;
    }
    /**
     * Get the contents of a document via an xpath
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return \DOMDocument dom of a simplified web page containing the
     *      head of the original document (if it has one) and the first
     *      node matching the xpath query within an html body tag. null is
     *      returned if the page could not be parsed, the query is empty,
     *      or nothing matched.
     */
    public static function getContentByXquery($page, $query)
    {
        $dom = L\getDomFromString($page);
        if (empty($dom->documentElement) || empty($query)) {
            return null;
        }
        $xpath = new \DOMXPath($dom);
        $xpath_result = @$xpath->query($query);
        // a malformed xpath yields false rather than a node list
        if (!($xpath_result instanceof \DOMNodeList) ||
            $xpath_result->length == 0) {
            return null;
        }
        $out_dom = new \DOMDocument();
        $out_dom->loadHTML("<html><head></head><body></body></html>");
        $html_node = $out_dom->documentElement;
        // when the content query succeeds we also copy over head of doc
        $head_xpath_result = $xpath->query("//head");
        if (!empty($head_xpath_result) && $head_xpath_result->length > 0) {
            $head_node = $out_dom->importNode(
                $head_xpath_result->item(0), true);
            $html_node->replaceChild($head_node,
                $html_node->childNodes->item(0));
        }
        $node = $out_dom->importNode($xpath_result->item(0), true);
        // child 0 of html is head, child 1 is body
        $html_node->childNodes->item(1)->appendChild($node);
        return $out_dom;
    }
    /**
     * Removes from the contents of a DOMDocument the results of
     * an xpath query. Surrounding whitespace of the query is ignored, and
     * a blank or malformed query removes nothing.
     *
     * @param \DOMDocument $dom a document to apply the xpath query against
     * @param string $query the xpath query to run
     * @return bool whether anything was removed from the DOMDocument
     */
    public static function removeContentByXquery($dom, $query)
    {
        if (is_string($query)) {
            /* queries come from splitting a text field on \n so may carry
               a stray \r or be whitespace only
             */
            $query = trim($query);
        }
        if (empty($dom) || empty($query)) {
            return false;
        }
        $xpath = new \DOMXPath($dom);
        $xpath_result = @$xpath->query($query);
        if (!($xpath_result instanceof \DOMNodeList)) {
            // malformed xpath: query returned false
            return false;
        }
        $did_remove_stuff = false;
        $len = $xpath_result->length;
        for ($i = 0; $i < $len; $i++) {
            $node = $xpath_result->item($i);
            $parent = $node->parentNode;
            if ($parent) {
                $parent->removeChild($node);
                $did_remove_stuff = true;
            }
        }
        return $did_remove_stuff;
    }
}

ViewGit