Last commit for src/library/summarizers/CentroidWeightedSummarizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett (chris@pollett.org)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\summarizers;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\LinearAlgebra;

/**
 * Class which may be used by TextProcessors to get a summary for a text
 * document that may later be used for indexing. This is done by
 * the @see getSummary method. To generate a summary, a normalized
 * term frequency vector is computed for each sentence. An average
 * vector is then computed by summing these and renormalizing the result.
 * The computation of this average vector is biased by weighting earlier
 * sentence vectors more when computing the sum of vectors. This is done
 * using weights coming from a Zipf-like distribution. Once an average
 * sentence is obtained, sentences are scored against it using
 * a residual cosine similarity score. I.e., the most important sentence
 * is determined by cosine rank. Then the components of this sentence in
 * the direction of the average sentence are deleted from the average
 * sentence, and the next most important sentence is computed by ranking
 * against this new average sentence vector, and so on.
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 *     rewritten Chris Pollett (chris@pollett.org)
 */
class CentroidWeightedSummarizer extends Summarizer
{
    /**
     * Generates a summary, word cloud, and summary scores based on
     * the closeness of normalized term frequency vectors to an average
     * term frequency vector for sentences.
     *
     * The sentence splitting, term frequency, word cloud, and final summary
     * assembly steps are delegated to helper methods invoked through self::
     * that are not defined in this class (presumably inherited from
     * Summarizer).
     *
     * @param object $dom document object model of page to summarize
     * @param string $page complete raw page to generate the summary from.
     * @param string $lang language of the page; it is passed on to the
     *      helpers that split sentences, compute term frequencies, and
     *      assemble the summary
     *
     * @return array a triple (string summary, array word cloud, array
     *      of position => scores for positions within the summary)
     */
    public static function getSummary($dom, $page, $lang)
    {
        list($original_sentences, $sentences) =
            self::getPunctuatedUnpunctuatedSentences($dom, $page, $lang);
        list($terms, $tf_per_sentence_normalized) =
            self::computeTermFrequenciesPerSentence($sentences, $lang);
        $tf_average_sentence =
            self::getAverageSentence($tf_per_sentence_normalized);
        // the word cloud is derived from the weighted average sentence
        $word_cloud = self::wordCloudFromTermVector($tf_average_sentence);
        $sorted_sentence_scores =
            self::scoreSentencesVersusAverage($tf_per_sentence_normalized,
            $tf_average_sentence);
        list($summary, $summary_scores) = self::getSummaryFromSentenceScores(
            $sorted_sentence_scores, $original_sentences, $lang);
        return [$summary, $word_cloud, $summary_scores];
    }

    /**
     * Computes an average sentence by adding the normalized term frequency
     * vectors for each sentence weighted by a Zipf like distribution on
     * sentence index and normalizing the resulting vector
     *
     * @param array $term_frequencies_normalized array of
     *      sentence index => (term => normalized frequency) vectors. The
     *      sentence index is used numerically to compute the weight of
     *      the sentence
     * @return array vector of term => weights as output by
     *      LinearAlgebra::normalize, sorted by decreasing weight; the
     *      empty array if no sentence vectors were supplied
     */
    public static function getAverageSentence($term_frequencies_normalized)
    {
        if (empty($term_frequencies_normalized)) {
            return [];
        }
        $average_sentence = [];
        foreach ($term_frequencies_normalized as
            $sentence_index => $term_frequencies_sentence) {
            /* used to slightly favor earlier sentences according
               to a Zipf like behavior: the sentence with index 0 gets
               weight 1 and the weight decays as the index grows
             */
            $sentence_weight = 1.0/(1.0 + pow($sentence_index, 0.80));
            foreach ($term_frequencies_sentence as
                $term_index => $frequency) {
                $frequency *= $sentence_weight;
                if (empty($average_sentence[$term_index])) {
                    $average_sentence[$term_index] = $frequency;
                } else {
                    $average_sentence[$term_index] += $frequency;
                }
            }
        }
        $average_sentence = LinearAlgebra::normalize($average_sentence);
        // most heavily weighted terms first, term keys are preserved
        arsort($average_sentence);
        return $average_sentence;
    }

    /**
     * Computes scores for each sentence => word vector in
     * an array of sentence => word_vectors based on
     * how it compares versus an average sentence word vector.
     * Here word vectors are normalized vectors and scores are determined
     * by inner product.
     *
     * Sentences are chosen greedily: on each round the sentence with the
     * largest inner product against the current average is recorded, then
     * each component of the average belonging to a term of that sentence
     * is scaled down by the sentence's weight for that term, and the
     * sentence is removed from further consideration. Rounds stop once
     * the best score found is no longer positive; the sentence found in
     * that last round is still recorded if its score is greater than -1,
     * so at most one sentence with a non-positive score appears in the
     * output and any remaining sentences are left unscored.
     *
     * @param array $sentence_vectors array of
     *      sentence index => (term => normalized frequency) vectors
     * @param array $average_sentence an array of term => average
     *      frequency value
     * @return array array of sentence index => score pairs, in the order
     *      in which the sentences were selected
     */
    public static function scoreSentencesVersusAverage($sentence_vectors,
        $average_sentence)
    {
        $sentence_scores = [];
        $max_sentence_score = 1;
        while ($max_sentence_score > 0) {
            $max_sentence_score = -1;
            $max_sentence_index = -1;
            /* compute the most important sentence vector based on
               current average
             */
            foreach ($sentence_vectors as $sentence => $word_vector) {
                $sentence_score = LinearAlgebra::dot($average_sentence,
                    $word_vector);
                if ($max_sentence_score < $sentence_score) {
                    $max_sentence_score = $sentence_score;
                    $max_sentence_index = $sentence;
                }
            }
            if ($max_sentence_score > -1) {
                // add most important index and score to $sentence_scores
                $sentence_scores[$max_sentence_index] = $max_sentence_score;
                $word_vector = $sentence_vectors[$max_sentence_index];
                /* delete $word_vector (which is supposed to be normalized)
                   from average. Terms the average does not contain have
                   nothing to remove; skipping them avoids an undefined
                   array key warning and avoids adding zero entries to
                   the average
                 */
                foreach ($word_vector as $term => $weight) {
                    if (isset($average_sentence[$term])) {
                        $average_sentence[$term] -=
                            $average_sentence[$term] * $weight;
                    }
                }
                unset($sentence_vectors[$max_sentence_index]);
            }
        }
        return $sentence_scores;
    }
}
ViewGit