Last commit for src/library/ScraperManager.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
/**
 * Class used by html processors to detect if a page matches a particular
 * signature such as that of a content management system, and
 * also to provide scraping mechanisms for the content of such a page
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 */
class ScraperManager
{
    /**
     * Method used to check a page against a supplied list of scrapers
     * for a matching signature. If a match is found that scraper is returned.
     * Scrapers that are empty or that have no (or an empty) SIGNATURE are
     * skipped, as there is nothing to match the page against.
     *
     * @param string $page the html page to check
     * @param array $scrapers an array of scrapers to check against. Each
     *      scraper is an associative array with fields SIGNATURE, ID,
     *      SCRAPE_RULES, and NAME; SIGNATURE and SCRAPE_RULES are
     *      html entity decoded before being used
     * @return array an associative array of scraper properties if a matching
     *      scraper signature found; otherwise, the empty array
     */
    public static function getScraper($page, $scrapers)
    {
        $out_scraper = [];
        foreach ($scrapers as $scraper) {
            if (empty($scraper) || !isset($scraper['SIGNATURE']) ||
                $scraper['SIGNATURE'] === '') {
                continue;
            }
            $signature = html_entity_decode(
                $scraper['SIGNATURE'], ENT_QUOTES);
            if (self::checkSignature($page, $signature)) {
                $out_scraper['SIGNATURE'] = $signature;
                $out_scraper['ID'] = isset($scraper['ID']) ?
                    $scraper['ID'] : null;
                $out_scraper['SCRAPE_RULES'] = html_entity_decode(
                    isset($scraper['SCRAPE_RULES']) ?
                    $scraper['SCRAPE_RULES'] : '', ENT_QUOTES);
                $out_scraper['NAME'] = isset($scraper['NAME']) ?
                    $scraper['NAME'] : null;
                break;
            }
        }
        return $out_scraper;
    }
    /**
     * Applies scrape rules to a given page. A scrape rule consists of
     * a sequence of xpaths delimited by ###. The first xpath is used to
     * extract content from the page, the remaining xpaths are used
     * to delete content from the result.
     *
     * @param string $page the html page to operate on
     * @param string $scrape_rules_string a string of xpaths with ###
     *  used as a delimiter
     * @return string the result of extracting first xpath content and
     *  deleting from it according to the remaining xpath rules. If there
     *  are no rules, or the first xpath extracts nothing, $page is
     *  returned unchanged
     */
    public static function applyScraperRules($page, $scrape_rules_string)
    {
        $scrape_rules = preg_split('/###/u',
            (string)$scrape_rules_string, 0, PREG_SPLIT_NO_EMPTY);
        /* preg_split gives false if the rules could not be split (for
           example, they are not valid UTF-8) and [] if there are no rules
         */
        if (empty($scrape_rules)) {
            return $page;
        }
        $extract_query = array_shift($scrape_rules);
        $content = self::getContentByXquery($page, $extract_query);
        if (empty($content)) {
            return $page;
        }
        foreach ($scrape_rules as $remove_query) {
            $pruned = self::removeContentByXquery($content, $remove_query);
            // a removal that leaves nothing behind is ignored
            if (!empty($pruned)) {
                $content = $pruned;
            }
        }
        return $content;
    }
    /**
     * If $signature begins with '/', checks to see if applying
     * the xpath in $signature to $page results in a non-empty dom node
     * list. Otherwise, $signature is treated as a regex written without
     * start and end delimiters (such as /) and is matched against $page
     * using mb_ereg.
     *
     * @param string $page a web document to check
     * @param string $signature an xpath (if it begins with '/') or a
     *      delimiter-less regex to check against
     * @return boolean true if the xpath returns a non-empty dom node list
     *      or the regex matches; false otherwise, including when
     *      $signature is empty
     */
    public static function checkSignature($page, $signature)
    {
        /* an empty signature can match nothing; it would also make both
           the $signature[0] look-up and mb_ereg fail
         */
        if (!is_string($signature) || $signature === '') {
            return false;
        }
        if ($signature[0] === '/') {
            $found = self::runXpath($page, $signature);
            return $found !== null && $found[1]->length > 0;
        }
        return (mb_ereg($signature, $page) !== false);
    }
    /**
     * Get the contents of a document via an xpath
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return string the html of the first node matching the query,
     *      otherwise an empty string
     */
    public static function getContentByXquery($page, $query)
    {
        $found = self::runXpath($page, $query);
        if ($found === null) {
            return "";
        }
        list($dom, $nodes) = $found;
        if ($nodes->length <= 0) {
            return "";
        }
        $html = $dom->saveHTML($nodes->item(0));
        return ($html === false) ? "" : $html;
    }
    /**
     * Removes from the contents of a document the results of
     * an xpath query
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return string the content less the xpath results as an HTML
     *      document. If the page could not be parsed, the query was not
     *      valid, or nothing matched, $page is returned unchanged
     */
    public static function removeContentByXquery($page, $query)
    {
        $found = self::runXpath($page, $query);
        if ($found === null) {
            return $page;
        }
        list($dom, $nodes) = $found;
        if ($nodes->length <= 0) {
            return $page;
        }
        foreach ($nodes as $node) {
            $parent = $node->parentNode;
            if ($parent) {
                $parent->removeChild($node);
            }
        }
        $html = $dom->saveHTML();
        return ($html === false) ? $page : $html;
    }
    /**
     * Parses $page as HTML and runs an xpath query against it. While this
     * is being done no user error handler is active and parser warnings
     * are suppressed. The error handler that was active on entry is always
     * put back with restore_error_handler, even if an exception occurs, so
     * calls to this method do not grow PHP's stack of error handlers.
     *
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     * @return array|null a pair [\DOMDocument, \DOMNodeList] consisting of
     *      the parsed document and the nodes matching the query; null if
     *      $page is empty or not a string, could not be parsed, or the
     *      query could not be evaluated
     */
    private static function runXpath($page, $query)
    {
        // loadHTML does not accept an empty source
        if (!is_string($page) || $page === '') {
            return null;
        }
        $dom = new \DOMDocument();
        $nodes = false;
        set_error_handler(null);
        try {
            if (@$dom->loadHTML($page)) {
                $xpath = new \DOMXPath($dom);
                $nodes = $xpath->query($query);
            }
        } finally {
            restore_error_handler();
        }
        return ($nodes === false) ? null : [$dom, $nodes];
    }
}

ViewGit