Last commit for src/library/ScraperManager.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2016  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2016
 * @filesource
 */
namespace seekquarry\yioop\library;

/**
 * Class used by html processors to detect if a page was made by a content
 * management system, and also to provide scraping mechanisms for the content
 * of such a page
 *
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 */
class ScraperManager
{
    /**
     * Method used to check a page against a supplied list of scrapers
     * for a matching signature. The scrapers are tried in order and the
     * first one whose signature matches is returned.
     *
     * @param string $page the html page to check
     * @param array $scrapers an array of scrapers to check against. Each
     *      scraper is an associative array with fields ID, NAME, SIGNATURE
     *      and SCRAPE_RULES; SIGNATURE and SCRAPE_RULES are html entity
     *      decoded before being used or returned
     * @return array an associative array of scraper properties (SIGNATURE,
     *      ID, SCRAPE_RULES, NAME) if a matching scraper signature found;
     *      otherwise, the empty array
     */
    public static function getScraper($page, $scrapers)
    {
        foreach ($scrapers as $scraper) {
            $signature = html_entity_decode($scraper['SIGNATURE'],
                ENT_QUOTES);
            if (self::checkSignature($page, $signature)) {
                /* The result is only built once a signature matches, so a
                   failed search really does return the empty array rather
                   than a stub holding the last signature tried.
                 */
                return [
                    'SIGNATURE' => $signature,
                    'ID' => $scraper['ID'],
                    'SCRAPE_RULES' => html_entity_decode(
                        $scraper['SCRAPE_RULES'], ENT_QUOTES),
                    'NAME' => $scraper['NAME'],
                ];
            }
        }
        return [];
    }
    /**
     * Applies scrape rules to a given page. A scrape rule consists of
     * a sequence of xpaths delimited by ###. The first path is used to
     * extract content from the page, the remaining xpaths are used
     * to delete content from the result. Deletion is textual: the html of
     * the first node each remaining xpath matches in $page is removed
     * wherever it occurs in the extracted content.
     *
     * @param string $page the html page to operate on
     * @param string $scrape_rules_string a string of xpaths with ###
     *  used as a delimiter
     * @return string the result of extracting first xpath content and
     *  deleting from it according to the remaining xpath rules. If there
     *  are no rules, or the result would be empty, $page is returned
     *  unchanged
     */
    public static function applyScraperRules($page, $scrape_rules_string)
    {
        $scrape_rules = preg_split('/###/u',
            (string)$scrape_rules_string, 0, PREG_SPLIT_NO_EMPTY);
        /* preg_split gives false on a PCRE failure (for example, invalid
           UTF-8 with the u modifier); empty() covers that case as well as
           an empty rule list
         */
        if (empty($scrape_rules)) {
            return $page;
        }
        $extract_rule = array_shift($scrape_rules);
        $content = self::getContentByXquery($page, $extract_rule);
        if (empty($content)) {
            return $page;
        }
        foreach ($scrape_rules as $delete_rule) {
            $unwanted = self::getContentByXquery($page, $delete_rule);
            if ($unwanted !== "") {
                $content = str_replace($unwanted, "", $content);
            }
        }
        return empty($content) ? $page : $content;
    }
    /**
     * If $signature begins with '/', checks to see if applying
     * the xpath in $signature to $page results
     * in a non-empty dom node list. Otherwise, $signature is treated as
     * literal text (it is escaped with preg_quote) and the method returns
     * whether that text occurs in $page. An empty signature is handled by
     * the literal branch and so matches every page.
     *
     * @param string $page a web document to check
     * @param string $signature an xpath (if it begins with '/') or
     *      literal text to check against
     * @return boolean true if the xpath returns a non-empty dom node list
     *      or the literal text is found in $page; false otherwise
     */
    public static function checkSignature($page, $signature)
    {
        $page = (string)$page;
        $signature = (string)$signature;
        // substr rather than $signature[0] so that "" does not warn
        if (substr($signature, 0, 1) === '/') {
            $dom = self::loadHtmlDocument($page);
            if ($dom === null) {
                return false;
            }
            $xpath = new \DOMXPath($dom);
            $results = $xpath->query($signature);
            // query() gives false for a malformed xpath expression
            return $results !== false && $results->length > 0;
        }
        /* "/" is passed as the delimiter so signatures containing a slash
           (for example, wp-content/themes) still produce a valid pattern
         */
        $regex = "/" . preg_quote($signature, "/") . "/";
        return preg_match($regex, $page) === 1;
    }
    /**
     * Get the contents of a document via an xpath
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     *
     * @return string the html of the first node matching $query, otherwise
     *      (no match, malformed query, or page could not be parsed) an
     *      empty string
     */
    public static function getContentByXquery($page, $query)
    {
        $query = (string)$query;
        if ($query === "") {
            return "";
        }
        $dom = self::loadHtmlDocument($page);
        if ($dom === null) {
            return "";
        }
        $xpath = new \DOMXPath($dom);
        $nodes = $xpath->query($query);
        // query() gives false for a malformed xpath expression
        if ($nodes === false || $nodes->length == 0) {
            return "";
        }
        $html = $dom->saveHTML($nodes->item(0));
        return ($html === false) ? "" : $html;
    }
    /**
     * Parses a string as an html document, suppressing the markup warnings
     * libxml emits for real-world pages.
     *
     * @param string $page the html to parse
     * @return \DOMDocument|null the parsed document, or null if $page is
     *      empty or could not be loaded
     */
    private static function loadHtmlDocument($page)
    {
        $page = (string)$page;
        /* PHP 8 raises a ValueError for an empty source and the @ operator
           cannot suppress that, so the empty case is handled up front
         */
        if ($page === "") {
            return null;
        }
        $dom = new \DOMDocument();
        if (!@$dom->loadHTML($page)) {
            return null;
        }
        return $dom;
    }
}

ViewGit