Last commit for src/library/summarizers/GraphBasedSummarizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\summarizers;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\LinearAlgebra;

/**
 * Class which may be used by TextProcessors to get a summary for a text
 * document that may later be used for indexing. The method @see getSummary
 * is used to obtain such a summary. In GraphBasedSummarizer's implementation
 * of this method sentences are ranks using a page rank style algorithm
 * based on sentence adjacencies calculated using a distortion score between
 * pair of sentence (@see LinearAlgebra::distortion for details on this).
 * The page rank is then biased using a Zipf-like transformation to slightly
 * favor sentences earlier in the document
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 *      Chris Pollett chris@pollett.org
 */
class GraphBasedSummarizer extends Summarizer
{
    /**
     * This summarizer uses a page rank-like algorithm to find the
     * important sentences in a document, generate a word cloud, and
     * give scores for those sentences.
     *
     * @param object $dom document object model of page to summarize
     * @param string $page complete raw page to generate the summary from.
     * @param string $lang language of the page to decide which stop words to
     *     call proper tokenizer.php of the specified language.
     *
     * @return array a triple (string summary, array word cloud, array
     *      of position => scores for positions within the summary)
     */
    public static function getSummary($dom, $page, $lang)
    {
        $unmodified_doc = strip_tags($page,
            '<a><h1><h2><h3><h4><h5><h6><b><em><i><u><dl><ol><ul><title>');
        list($sentences_with_punctuation, $sentences) =
            self::getPunctuatedUnpunctuatedSentences($dom, $unmodified_doc,
            $lang);
        $sentences = PhraseParser::stemTermsK($sentences, $lang, true);
        list($terms, $tf_per_sentence_normalized) =
            self::computeTermFrequenciesPerSentence($sentences, $lang);
        $adjacency = self::computeAdjacency($tf_per_sentence_normalized);
        $sorted_sentence_ranks = self::getSentenceRanks($adjacency);
        list($summary, $summary_scores) = self::getSummaryFromSentenceScores(
            $sorted_sentence_ranks, $sentences_with_punctuation, $lang);
        return [$summary, self::wordCloudFromSummary($summary,  $lang),
            $summary_scores];
    }
    /**
     * Compute the sentence ranks using power method.
     * Take adjacency matrix and apply it 10 times to starting sentence
     * ranks, all of which are 1/n where n is the number of sentences.
     * We assume pr = (A^{10}r) approximates (A^{11}r) and so A*pr = pr,
     * i.e, the pr vector is a eigentvector of A and its components
     * approximate the importance of each sentence. After computing
     * ranks in this way, we multiply the components of the resulting
     * vector to slightly bias the results to favor earlier sentences in
     * the document using a Zipf-like distribution on sentence order.
     *
     * @param array $adjacency the adjacency matrix (normalized to
     *      satisfy conditions for power method to converge) generated for the
     *      sentences
     * @return array the sentence ranks
     */
    public static function getSentenceRanks($adjacency)
    {
        $n = count($adjacency);
        $sentence_ranks = array_fill(0, $n, 1.0 / $n);
        for ($i = 0; $i < 10; $i++ ) {
            $sentence_ranks = LinearAlgebra::multiply($adjacency,
                $sentence_ranks);
        }
        // bias start of doc according to a zipf like weighting
        foreach ($sentence_ranks as $index => $weight) {
            $weight *= 1.0/(1.0 + pow($index, 0.4));
            $sentence_ranks[$index] = $weight;
        }
        arsort($sentence_ranks);
        return $sentence_ranks;
    }
    /**
     * Compute the adjacency matrix based on its distortion measure
     * @param array $tf_per_sentence_normalized the array of term frequencies
     * @return array the array of sentence adjacency
     */
    public static function computeAdjacency($tf_per_sentence_normalized)
    {
        $result = [[]];
        $n = count($tf_per_sentence_normalized);
        for ($i = 0; $i < $n; $i++ ) {
            $result[$i][$i] = 0;
            for ($j = $i + 1; $j < $n; $j++ ) {
                $result[$i][$j] = LinearAlgebra::distortion(
                    $tf_per_sentence_normalized[$i],
                    $tf_per_sentence_normalized[$j]);
                $result[$j][$i] = $result[$i][$j];
            }
        }
        return $result;
    }
}
ViewGit