Last commit for src/library/summarizers/ScrapeSummarizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\library\summarizers;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\processors\TextProcessor;
use seekquarry\yioop\library\processors\HtmlProcessor;

/**
 * Class which may be used by TextProcessors to get a summary for a text
 * document that may later be used for indexing.
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 */
class ScrapeSummarizer extends Summarizer
{
    /**
     * Whether to also write the computed summary to disk (debugging aid)
     */
    const OUTPUT_TO_FILE = false;
    /**
     * Path, appended to C\WORK_DIRECTORY, of the file the summary is
     * written to when OUTPUT_TO_FILE is true
     */
    const OUTPUT_FILE_PATH = "/temp/scrape_summarizer_result.txt";
    /**
     * Scrapes the web document for important tags to make a summary
     *
     * @param object $dom a document object to extract a description from.
     * @param string $page original page string to extract description from
     * @param string $lang language of the page; passed on to
     *     PhraseParser::compressSentence when compressing snippets
     *
     * @return array a pair (string summary, empty array word cloud)
     */
    public static function getSummary($dom, $page, $lang)
    {
        return [self::description($dom, $page, $lang), []];
    }
    /**
     * Returns descriptive text concerning a webpage based on its document
     * object.
     *
     * The text of any meta tag whose name contains "description" is always
     * placed first. If the page is short relative to
     * TextProcessor::$max_description_len, the crude tag-stripped text of
     * the whole page is appended and returned. Otherwise snippets are
     * collected from a fixed list of tags, grouped by length, and appended
     * longest group first.
     *
     * @param object $dom a document object to extract a description from.
     * @param string $page original page string to extract description from
     * @param string $lang language of the page; passed on to
     *     PhraseParser::compressSentence when compressing snippets
     * @return string a description of the page
     */
    public static function description($dom, $page, $lang)
    {
        $xpath = new \DOMXPath($dom);
        $metas = $xpath->evaluate("/html//meta");
        $description = "";
        $output_file_contents = "";
        //look for a meta tag with a description
        foreach ($metas as $meta) {
            if (stristr($meta->getAttribute('name'), "description")) {
                $description .= " .. " . $meta->getAttribute('content');
            }
        }
        if (TextProcessor::$max_description_len > strlen($page) / 1.1) {
            /* if don't need to summarize much, take meta description
               from above code, then concatenate body of doc
               after stripping tags, return result
             */
            $description .= "\n" . HtmlProcessor::crudeDescription($page);
            if (self::OUTPUT_TO_FILE) {
                file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH,
                    $description);
            }
            return $description;
        }
        /*
          concatenate the contents of the additional dom elements up to
          the limit of description length. Choose tags in order of likely
          importance to this doc
        */
        $page_parts = ["//p[1]",
            "//div[1]", "//p[2]", "//div[2]", "//p[3]",
            "//div[3]", "//p[4]", "//div[4]",
            "//td", "//li", "//dt", "//dd",
            "//pre", "//a", "//article",
            "//section", "//cite"];
        /* expressions for which only the first matching node is used, even
           though the XPath itself may match several nodes
         */
        $single_match_parts = ["//p[1]", "//div[1]",
            "//div[2]", "//p[2]", "//p[3]",
            "//div[3]", "//div[4]", "//p[4]"];
        /* No single snippet may exceed half of the description limit. The
           value is cast to int because an odd limit would otherwise give
           a float, which is then used as an array key and as a substring
           length (lossy, and deprecated since PHP 8.1).
         */
        $max_snippet_len = (int)(TextProcessor::$max_description_len / 2);
        // snippets grouped by their (truncated) length in characters
        $para_data = [];
        $len = 0;
        foreach ($page_parts as $part) {
            $doc_nodes = $xpath->evaluate($part);
            foreach ($doc_nodes as $node) {
                if ($part === "//a") {
                    /* for links also record the target url; its length is
                       not counted against the running total $len
                     */
                    $content = $node->getAttribute('href') . " = ";
                    $add_len = min($max_snippet_len, mb_strlen($content));
                    $para_data[$add_len][] = mb_substr($content, 0, $add_len);
                }
                $node_text = HtmlProcessor::domNodeToString($node);
                $add_len = min($max_snippet_len, mb_strlen($node_text));
                $para_data[$add_len][] = mb_substr($node_text, 0, $add_len);
                $len += $add_len;
                if ($len > TextProcessor::$max_description_len) {
                    // enough text gathered, stop scanning all tags
                    break 2;
                }
                if (in_array($part, $single_match_parts, true)) {
                    break;
                }
            }
        }
        // longest snippets first
        krsort($para_data);
        foreach ($para_data as $add_len => $data) {
            if (!isset($first_len)) {
                $first_len = $add_len;
            }
            foreach ($data as $datum) {
                $datum = PhraseParser::compressSentence($datum, $lang);
                $description .= " ..\n " . $datum;
                if (self::OUTPUT_TO_FILE) {
                    if ($output_file_contents === "") {
                        $output_file_contents = trim($datum);
                    } else {
                        $output_file_contents = $output_file_contents .
                            "\n" . trim($datum);
                    }
                }
            }
            /* once a group less than a third as long as the longest
               group has been added, the remaining shorter groups are
               dropped
             */
            if ($first_len > 3 * $add_len) {
                break;
            }
        }
        /* Collapse whitespace runs to a single space. With the /u modifier
           preg_replace returns null if the subject is not valid UTF-8, so
           retry byte-wise and, failing that, keep the text uncompressed
           rather than returning null.
         */
        $compressed = preg_replace("/(\s)+/u", " ", $description);
        if ($compressed === null) {
            $compressed = preg_replace("/(\s)+/", " ", $description);
        }
        if ($compressed !== null) {
            $description = $compressed;
        }
        if (self::OUTPUT_TO_FILE) {
            file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH,
                $output_file_contents);
        }
        return $description;
    }
}
ViewGit