Last commit for src/library/processors/EpubProcessor.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Vijeth Patil vijeth.patil@gmail.com
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\processors;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\PartialZipArchive;

/**
 * Used to create crawl summary information
 * for XML files (those served as application/epub+zip)
 *
 * @author Vijeth Patil
 */
class EpubProcessor extends TextProcessor
{
    /**
     * The constant represents the number of
     * child levels at which the data is present in
     * the content.opf file.
     */
    const MAX_DOM_LEVEL = 15;
    /**
     * Set-ups the any indexing plugins associated with this page
     * processor
     *
     * @param array $plugins an array of indexing plugins which might
     *     do further processing on the data handles by this page
     *     processor
     * @param int $max_description_len maximal length of a page summary
     * @param int $max_links_to_extract maximum number of links to extract
     *      from a single document
     * @param string $summarizer_option CRAWL_CONSTANT specifying what kind
     *      of summarizer to use self::BASIC_SUMMARIZER,
     *      self::GRAPH_BASED_SUMMARIZER and self::CENTROID_SUMMARIZER
     *      self::CENTROID_SUMMARIZER
     */
    public function __construct($plugins = [], $max_description_len = null,
        $max_links_to_extract = null,
        $summarizer_option = self::BASIC_SUMMARIZER)
    {
        parent::__construct($plugins, $max_description_len,
            $max_links_to_extract, $summarizer_option);
        /** Register File Types We Handle*/
        self::$indexed_file_types[] = "epub";
        self::$mime_processor["application/epub+zip"] = "EpubProcessor";
    }
    /**
     * The name of the tag element in an xml document
     *
     * @var string name
     */
    public $name;
    /**
     * The attribute of the tag element in an xml document
     *
     * @var string attributes
     */
    public $attributes;
    /**
     * The content of the tag element or attribute, used to extract
     * the fields like title, creator, language of the document
     *
     * @var string content
     */
    public $content;
    /**
     * The child tag element of a tag element.
     *
     * @var string children
     */
    public $children;
    /**
     * Used to extract the title, description and links from
     * a string consisting of ebook publication data.
     *
     * @param string $page epub contents
     * @param string $url the url where the page contents came from,
     *    used to canonicalize relative links
     *
     * @return array  a summary of the contents of the page
     *
     */
    public function process($page, $url)
    {
        $summary = null;
        $opf_pattern = "/.opf$/i";
        $html_pattern  = "/.html$/i";
        $xhtml_pattern = "/.xhtml$/i";
        $epub_url[0] = '';
        $epub_language = '';
        $epub_title = '';
        $epub_unique_identifier = '';
        $epub_author = '';
        $epub_publisher = '';
        $epub_date = '';
        $epub_subject = '';
        $desc = '';
        $htmlcontent = '';
        // Open a zip archive
        $zip = new PartialZipArchive($page);
        $num_files = $zip->numFiles();
        for ($i = 0; $i < $num_files; $i++) {
            // get the content file names of .epub document
            $filename[$i] = $zip->getNameIndex($i);
            if (preg_match($opf_pattern, $filename[$i])) {
                // Get the file data from zipped folder
                $opf_data = $zip->getFromName($filename[$i]);
                $opf_summary = $this->xmlToObject($opf_data);
                for ($m = 0; $m <= self::MAX_DOM_LEVEL; $m++) {
                    for ($n = 0; $n <= self::MAX_DOM_LEVEL; $n++) {
                        if (isset($opf_summary->children[$m]->children[$n])){
                            $child = $opf_summary->children[$m]->
                                children[$n];
                            if ( isset($child->name) &&
                                $child->name == "dc:language") {
                                $epub_language =
                                    $opf_summary->children[$m]->
                                        children[$n]->content ;
                            }
                            if ( ($opf_summary->children[$m]->children[$n]->
                                name) == "dc:title") {
                                $epub_title = $opf_summary->children[$m]->
                                    children[$n]->content;
                            }
                            if ( ($opf_summary->children[$m]->children[$n]->
                                name) == "dc:creator") {
                                $epub_author = $opf_summary->children[$m]->
                                    children[$n]->content ;
                            }
                            if ( ($opf_summary->children[$m]->children[$n]->
                                name) == "dc:identifier") {
                                $epub_unique_identifier = $opf_summary->
                                    children[$m]->children[$n]->content ;
                            }
                        }
                    }
                }
            } else if ((preg_match($html_pattern, $filename[$i])) ||
                (preg_match($xhtml_pattern, $filename[$i]))) {
                $html = new HtmlProcessor();
                $html_data = $zip->getFromName($filename[$i]);
                $description[$i] = $html->process($html_data, $url);
                $htmlcontent.= $description[$i]['t'];
            }
        }
        if ($epub_title != '') {
            $desc= " $epub_title .";
        }
        if ($epub_author != '') {
            $desc = $desc." $epub_author ";
        }
        if ($epub_language != '') {
            $desc = $desc." $epub_language ";
        }
        if ($epub_unique_identifier != '') {
            $desc = $desc." URN-".
            $epub_unique_identifier.".";
        }
        if ($epub_publisher != '') {
            $desc = $desc." $epub_publisher ";
        }
        if ($epub_date != '') {
            $desc = $desc." $epub_date ";
        }
        if ($epub_subject != '') {
            $desc = $desc." $epub_subject ";
        }
        $desc= $desc.$htmlcontent;
        //restrict the length of the description to maximum description length
        if (strlen($desc) > self::$max_description_len) {
            $desc = substr($desc, 0, self::$max_description_len);
        }
        $summary[self::TITLE] = $epub_title;
        $summary[self::DESCRIPTION] = $desc;
        $summary[self::LANG] = $epub_language;
        $summary[self::LINKS] = $epub_url;
        $summary[self::PAGE] = $page;
        return $summary;
    }
    /**
     * Used to extract the DOM tree containing the information
     * about the epub file such as title, author, language, unique
     * identifier of the book from a string consisting of ebook publication
     * content OPF file.
     *
     * @param string $xml page contents
     *
     * @return array an information about the contents of the page
     *
     */
    public function xmlToObject($xml)
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
        xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 1);
        xml_parse_into_struct($parser, $xml, $tags);
        xml_parser_free($parser);
        $elements = [];  // the currently filling [child] XmlElement array
        $stack = [];
        foreach ($tags as $tag) {
            $index = count($elements);
            if ($tag['type'] == "complete" || $tag['type'] == "open") {
                $elements[$index] = new EpubProcessor;
                $elements[$index]->name = $tag['tag'];
                if (isset($tag['attributes'])) {
                    $elements[$index]->attributes = $tag['attributes'];
                }
                if (isset($tag['value'])) {
                    $elements[$index]->content = $tag['value'];
                }
                if ($tag['type'] == "open") {  // push
                    $elements[$index]->children = [];
                    $stack[count($stack)] = &$elements;
                    $elements = &$elements[$index]->children;
                }
            }
            if ($tag['type'] == "close") {  // pop
                $elements = &$stack[count($stack) - 1];
                unset($stack[count($stack) - 1]);
            }
        }
        return $elements[0];  // the single top-level element
    }
}

ViewGit