Last commit for src/library/summarizers/ScrapeSummarizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\summarizers;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\processors\PageProcessor;

/**
 * Class which may be used by TextProcessors to get a summary for a text
 * document that may later be used for indexing.
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 *  rewritten Chris Pollett chris@pollett.org
 */
class ScrapeSummarizer extends Summarizer
{
    /**
     * Scrapes a web document for important tags to make a summary.
     *
     * Candidate blocks are collected from two places: the content of any
     * meta tags whose name contains "description", and the sentences found
     * in the heading, paragraph, list, table and sectioning tags of $page.
     * Each block gets a rank which grows with the log of its length, is
     * scaled by a per-tag weight, and shrinks with the square root of its
     * position in the page. The ranked blocks are then handed to
     * getSummaryFromSentenceScores() to produce the summary.
     *
     * @param object $dom a document object to extract a description from.
     *      If it is not a \DOMDocument the meta tag step is skipped.
     * @param string $page original page string to extract description from
     * @param string $lang language of the page; forwarded to
     *      getSummaryFromSentenceScores() and wordCloudFromSummary()
     *      (presumably to pick that language's tokenizer and stop words)
     *
     * @return array a triple (string summary, array word cloud, array
     *      of position => scores for positions within the summary)
     */
    public static function getSummary($dom, $page, $lang)
    {
        $max_summary_len = PageProcessor::$max_description_len;
        $block = "";
        $ellipsis = "";
        //look for meta tags with a description
        if ($dom instanceof \DOMDocument) {
            $xpath = new \DOMXPath($dom);
            $metas = $xpath->evaluate("/html//meta");
            foreach ($metas as $meta) {
                if (stripos($meta->getAttribute('name'), "description")
                    !== false &&
                    trim($meta->getAttribute('content')) !== "") {
                    $block .= $ellipsis . $meta->getAttribute('content');
                    $ellipsis = " .. ";
                }
            }
        }
        $block = trim($block);
        $blocks = [];
        $block_ranks = [];
        if ($block !== "") {
            $blocks[] = $block;
            $block_ranks[] = 3. * log(strlen($block) + 1);
        }
        $tag_weights = [ "h1" => 3., "h2" => 3., "pre" => 0.5, "li" => 0.5];
        /* Upper bound on the unscaled rank of any block. It uses the page
           length rather than its log, so it is a deliberately loose bound.
         */
        $max_score = max($tag_weights) * strlen($page);
        /* Blocks with index below $changeable_index are treated as fixed:
           no block found later in the page can outrank them.
         */
        $changeable_index = 0;
        $fixed_summary_len = 0;
        $pos = 0;
        while ($pos >= 0) {
            list($pos, $block, $tag_name) = self::offsetContentsNextTag(
                "h\d|div|pr?e?|th|td|li|dt|dd|article|section|cite", $page,
                $pos, true);
            if ($pos < 0) {
                /* No further tag was found. Stopping here also keeps
                   $score_pos below from ever being 0, which would make the
                   rank computation divide by zero.
                 */
                break;
            }
            $block = trim(strip_tags($block));
            $score_pos = pow(1 + $pos, 0.5);
            if (!empty($block)) {
                $sentences = self::getSentences($block);
                $weight = (empty($tag_weights[$tag_name])) ? 1.0 :
                    $tag_weights[$tag_name];
                foreach ($sentences as $sentence) {
                    $blocks[] = $sentence;
                    $block_ranks[] = ($weight * log(strlen($sentence) + 1)) /
                        $score_pos;
                }
            }
            /* If even the best possible block from this position onwards
               would rank below the block at $changeable_index, that block
               is settled. Once the settled blocks are longer than the
               maximum summary length there is no need to scan further.
             */
            if ($pos > 0 && !empty($block_ranks[$changeable_index]) &&
                $max_score / $score_pos < $block_ranks[$changeable_index]) {
                $fixed_summary_len += strlen($blocks[$changeable_index]);
                $changeable_index++;
                if ($fixed_summary_len > $max_summary_len) {
                    break;
                }
            }
        }
        arsort($block_ranks);
        list($summary, $summary_scores) = self::getSummaryFromSentenceScores(
            $block_ranks, $blocks, $lang);
        return [$summary, self::wordCloudFromSummary($summary, $lang),
            $summary_scores];
    }
    /**
     * Returns the contents of the next tag in $page, searching from $offset,
     * whose name matches $tag_regex.
     *
     * If $lazy is false, the first open tag matching $tag_regex is found,
     * its tag name foo is read off, and the text between that open tag and
     * the nearest following closing tag /foo is returned. If there is no
     * such closing tag, the remainder of $page is returned as the contents
     * and the returned position is -1.
     *
     * If $lazy is true, the first open tag matching $tag_regex is found and
     * then the nearest following closing tag that matches $tag_regex. This
     * may or may not close the tag that was opened. For example, $tag_regex
     * might be foo|goo and a /goo might be found before a /foo. The returned
     * contents run from the start of the open tag (its own markup included)
     * up to that closing tag, and the returned tag name is the name of the
     * closing tag. If no closing tag is found the search fails.
     *
     * Note: the pattern built from $tag_regex is not anchored at the end of
     * the tag name, so for instance li also matches an open link tag.
     *
     * @param string $tag_regex a regex string that matches for xml tag names
     *    not including < and >. I.e., the pattern p would match for a <p> tag
     *    p|h\d would match <p>,<h1>,<h2>, etc.
     * @param string $page string we are searching for tag contents
     * @param int $offset where to start searching from in $page
     * @param boolean $lazy whether to be greedy or lazy in the contents
     *      we return in the sense described above.
     *
     * @return array a triple [position in $page just after the closing tag
     *      or -1 if there was none, contents string, tag name]. When the
     *      search fails the triple is [-1, "", $tag_regex].
     */
    public static function offsetContentsNextTag($tag_regex, $page,
        $offset = -1, $lazy = false)
    {
        // every failure path returns this same three element shape
        $not_found = [-1, "", $tag_regex];
        list($pos, $start_tag) = L\preg_search(
            "/\<(" . $tag_regex . ")[^\>]*\>/u", $page, $offset, true);
        if ($pos == -1) {
            return $not_found;
        }
        $start_contents_pos = $pos + strlen($start_tag);
        if ($lazy) {
            list($end_tag_pos, $end_tag) = L\preg_search(
                "/\<\/(" . $tag_regex . ")[^\>]*\>/", $page,
                $start_contents_pos, true);
            if ($end_tag_pos == -1) {
                return $not_found;
            }
            // the tag name sits between the leading "</" and the first
            // whitespace or ">"
            $tag_name_end_pos = L\preg_search("/\s|\>/", $end_tag, 2);
            if ($tag_name_end_pos == -1) {
                return $not_found;
            }
            $tag_name = substr($end_tag, 2, $tag_name_end_pos - 2);
            /* Lazy contents start at the open tag itself rather than just
               after it. Callers that want text only (as getSummary does)
               are expected to strip the markup.
             */
            $start_contents_pos = $pos;
        } else {
            // the tag name sits between the leading "<" and the first
            // whitespace or ">"
            $tag_name_end_pos = L\preg_search("/\s|\>/", $start_tag, 1);
            if ($tag_name_end_pos == -1) {
                return $not_found;
            }
            $tag_name = substr($start_tag, 1, $tag_name_end_pos - 1);
            /* $tag_name comes from the page and may contain regex
               metacharacters, so quote it before building the pattern.
             */
            list($end_tag_pos, $end_tag) = L\preg_search("/\<\/(" .
                preg_quote($tag_name, "/") . ")[^\>]*\>/", $page,
                $start_contents_pos, true);
        }
        if ($end_tag_pos == -1) {
            // unclosed tag (non-lazy only): contents are the rest of the page
            return [-1, substr($page, $start_contents_pos), $tag_name];
        }
        $contents = substr($page, $start_contents_pos,
            $end_tag_pos - $start_contents_pos);
        return [$end_tag_pos + strlen($end_tag), $contents, $tag_name];
    }
}

ViewGit