Last commit for src/library/processors/RssProcessor.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\library\processors;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\UrlParser;

/**
 * Used to create crawl summary information
 * for RSS or Atom files
 *
 * @author Chris Pollett
 */
class RssProcessor extends TextProcessor
{
    /**
     * Set-ups the any indexing plugins associated with this page
     * processor
     *
     * @param array $plugins an array of indexing plugins which might
     *     do further processing on the data handles by this page
     *     processor
     * @param int $max_description_len maximal length of a page summary
     * @param string $summarizer_option CRAWL_CONSTANT specifying what kind
     *      of summarizer to use self::BASIC_SUMMARIZER,
     *      self::GRAPH_BASED_SUMMARIZER and self::CENTROID_SUMMARIZER
     *      self::CENTROID_SUMMARIZER
     */
    public function __construct($plugins = [], $max_description_len = null,
        $summarizer_option = self::BASIC_SUMMARIZER)
    {
        parent::__construct($plugins, $max_description_len, $summarizer_option);
        /** Register File Types We Handle*/
        self::$indexed_file_types[] = "rss";
        self::$mime_processor["application/rss+xml"] = "RssProcessor";
        self::$mime_processor["application/atom+xml"] = "RssProcessor";
    }
    /**
     * Used to extract the title, description and links from
     * a string consisting of rss or atom news feed data.
     *
     * @param string $page   web-page contents
     * @param string $url   the url where the page contents came from,
     *    used to canonicalize relative links
     *
     * @return array  a summary of the contents of the page
     *
     */
    public function process($page, $url)
    {
        $summary = null;
        if (is_string($page)) {
            $dom = self::dom($page);
            $atom = false;
            $feed_nodes = $dom->getElementsByTagName('feed');
            if ($feed_nodes->length > 0) {
                $atom = true;
            }
            if ($dom !==false) {
                $summary[self::TITLE] = self::title($dom, $atom);
                $summary[self::DESCRIPTION] = self::description($dom, $atom);
                $summary[self::LANG] = self::lang($dom,
                    $summary[self::DESCRIPTION]);
                $summary[self::LINKS] = self::links($dom, $url, $atom);
                if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
                    == 0 && count($summary[self::LINKS]) == 0) {
                    //maybe not rss or atom? treat as text still try to get urls
                    $summary = parent::process($page, $url);
                }
            }
        }
        return $summary;
    }
    /**
     * Determines the language of the rss document by looking at the channel
     * language tag
     *
     * @param object $dom - a document object to check the language of
     * @param string $sample_text sample text to try guess the language from
     * @param string $url guess lang from url as fallback
     *
     * @return string language tag for guessed language
     */
    public static function lang($dom, $sample_text = null, $url = null)
    {
        $xpath = new \DOMXPath($dom);
        $languages = $xpath->evaluate("/rss/channel/language");
        if ($languages && is_object($languages) &&
            is_object($languages->item(0))) {
            return $languages->item(0)->textContent;
        } else {
            $lang = self::calculateLang($sample_text, $url);
        }
        return $lang;
    }
    /**
     * Return a document object based on a string containing the contents of
     * an RSS page
     *
     * @param string $page   a web page
     *
     * @return object  document object
     */
    public static function dom($page)
    {
        $dom = new \DOMDocument();
        @$dom->loadXML($page);
        return $dom;
    }
    /**
     * Returns html head title of a webpage based on its document object
     *
     * @param object $dom   a document object to extract a title from.
     * @param bool $atom if the feed is atom or rss
     * @return string  a title of the page
     *
     */
    public static function title($dom, $atom = false)
    {
        $sites = [];
        $xpath = new \DOMXPath($dom);
        if ($atom){
            $xpath->registerNamespace('atom', "http://www.w3.org/2005/Atom");
        }
        $title_query = ($atom) ? "/feed/title|/atom:feed/atom:title" :
            "/rss/channel/title";
        $titles = $xpath->evaluate($title_query);
        $title = "";
        foreach ($titles as $pre_title) {
            $title .= $pre_title->textContent;
        }
        return $title;
    }
    /**
     * Returns descriptive text concerning a webpage based on its document
     * object
     *
     * @param object $dom   a document object to extract a description from.
     * @param bool $atom if the feed is atom or rss
     * @return string a description of the page
     */
    public static function description($dom, $atom = false) {
        $sites = [];
        $xpath = new \DOMXPath($dom);
        $description = "";
        /*
          concatenate the contents of these dom elements up to
          the limit of description length
        */
        $page_parts = ["/rss/channel/description",
            "/rss/channel/category", "/rss/channel/lastBuildDate",
            "/rss/channel/copyright"];
        if ($atom) {
            $xpath->registerNamespace('atom', "http://www.w3.org/2005/Atom");
            $page_parts = ["/feed/subtitle", "/feed/author",
                "/feed/updated", "/feed/rights", "/feed/generator",
                "/atom:feed/atom:subtitle", "/atom:feed/atom:author",
                "/atom:feed/atom:updated", "/atom:feed/atom:rights",
                "/atom:feed/atom:generator"];
        }
        foreach ($page_parts as $part) {
            $doc_nodes = $xpath->evaluate($part);
            foreach ($doc_nodes as $node) {
                $description .= " ".$node->textContent;
                if (strlen($description) > self::$max_description_len) {
                    break 2;
                }
            }
        }
        $description = mb_ereg_replace("(\s)+", " ",  $description);
        return $description;
    }
    /**
     * Returns up to MAX_LINK_PER_PAGE many links from the supplied
     * dom object where links have been canonicalized according to
     * the supplied $site information.
     *
     * @param object $dom   a document object with links on it
     * @param string $site   a string containing a url
     * @param bool $atom if the feed is atom or rss
     *
     * @return array   links from the $dom object
     */
    public static function links($dom, $site, $atom = false)
    {
        $sites = [];
        $xpath = new \DOMXPath($dom);
        $link_nodes = [
            "/rss/channel" => [ "url" =>"link", "text" => "title"],
            "/rss/channel/image" => [ "url" =>"url", "text" => "title"],
            "/rss/channel/item" => [ "url" =>"link", "text" => "title"],
        ];
        if ($atom) {
            $xpath->registerNamespace('atom', "http://www.w3.org/2005/Atom");
            $link_nodes = [
                "/feed/entry" => [ "url" =>"link", "text" => "title"],
                "/atom:feed/atom:entry"
                    => [ "url" =>"link", "text" => "title"],
            ];
        }
        $i = 0;
        foreach ($link_nodes as $path => $url_text_pair) {
            $nodes = $xpath->evaluate($path);
            foreach ($nodes as $node) {
                $result = self::linkAndTexts($node,
                    $url_text_pair['url'], $url_text_pair['text'], $site,
                    $atom);
                if ($result != false) {
                    list($url, $text) = $result;
                    $sites[$url] = $text;
                    $i++;
                }
                if ($i >= C\MAX_LINKS_TO_EXTRACT) {
                    break 2;
                }
            }
        }
       return $sites;
    }
    /**
     * Returns a url text pair where the url comes from the link of
     * the given item node and the text comes from the text data for that node.
     * urls are canonicalized according to site.
     *
     * @param object $item_node the DOMNode to get a link and text from
     * @param string $link_name name of link tag
     * @param string $text_name name of text tag to associate with link
     * @param string $site   a string containing a url
     * @param bool $atom if the feed is atom or rss
     *
     * @return array a url,text pair
     */
    public static function linkAndTexts($item_node, $link_name, $text_name,
        $site, $atom = false)
    {
        $text = "";
        foreach ($item_node->childNodes as $node) {
            if ($node->nodeName == $link_name) {
                if (!$atom) {
                    $url = UrlParser::canonicalLink(
                        $node->textContent, $site);
                } else {
                    $url = UrlParser::canonicalLink(
                        $node->getAttribute("href"), $site);
                }
                if ($url === null || $url === "" ||
                    UrlParser::checkRecursiveUrl($url) ||
                    strlen($url) >= C\MAX_URL_LEN) {
                    return false;
                }
            }
            if ($node->nodeName == $text_name) {
                $text = $node->textContent;
                if ($text == "") {
                    $text = "RSS Feed";
                    if ($atom) {
                        $text = "Atom Feed";
                    }
                }
            }
        }
        if (!isset($url) || $url == "") return false;
        $text = mb_ereg_replace("(\s)+", " ",  $text);
        return [$url, $text];
    }
}

ViewGit