Last commit for src/library/Trie.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Sandhya Vissapragada, Chris Pollett (separated out this
 *     code into a separate file and cleaned up)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

/**
 * Implements a trie data structure which can be used to store terms read
 * from a dictionary in a succinct way
 *
 * @author Sandhya Vissapragada, Chris Pollett (rewrite +
 *     documentation, multi-byte support)
 */
class Trie
{
    /**
     * A nested array used to represent the trie
     * @var array
     */
    public $trie_array;
    /**
     * The marker used to represent the end of an entry in a trie
     * @var string
     */
    public $end_marker;
    /**
     * Creates and returns an empty trie. Sets the end of term character
     *
     * @param string $end_marker end of term marker
     */
    public function __construct($end_marker = " ")
    {
        $this->trie_array = [];
        $this->end_marker = $end_marker;
    }
    /**
     * Adds a term to the Trie
     *
     * @param string $term the term to be inserted if a space separated
     *      term value pair. Then the term is inserted into the trie
     *      and the leaf label is the value.
     * @param bool $raw_url_decode decide term should be encoded in utf-8 or url
     * @return array $trie_array beneath last letter of term inserted
     */
    public function add($term, $raw_url_decode = true)
    {
        $trie_array = & $this->trie_array;
        $term_arr = explode(" ", $term);
        if (!isset($term_arr[1])) {
            $term_arr[1] = null;
        }
        for ($i = 0; $i < mb_strlen($term_arr[0], "utf-8"); $i++) {
            $character = mb_substr($term_arr[0], $i, 1, "utf-8");
            $enc_char = $raw_url_decode ? rawurlencode($character) : $character;
            // To avoid encoding the linefeed
            if ($enc_char == "%0A") {
                continue;
            } else {
                /* If letter doesn't exist then create one by
                    assigning new array
                 */
                if (!isset($trie_array[$enc_char])) {
                    $trie_array[$enc_char] = [];
                }
                $trie_array = & $trie_array[$enc_char];
            }
        }
        // Set end of term marker
        $trie_array[$this->end_marker] = $term_arr[1];
        return $trie_array;
    }
    /**
     * Returns the sub trie_array under $term in
     * $this->trie_array. If $term does not exist in $trie->trie_array
     * returns false
     *
     * @param string $term term to look up
     * @return array $trie_array subtrie under term
     */
    public function exists($term)
    {
        $trie_array = & $this->trie_array;
        $len = mb_strlen($term,"utf-8");
        for ($i = 0; $i < $len; $i++) {
            if ($trie_array == null) {
                return false;
            }
            if ($trie_array != $this->end_marker) {
                $character = mb_substr($term, $i, 1, "utf-8");
                $enc_char = rawurlencode($character);
                if (!isset($trie_array[$enc_char])) {
                    return false;
                }
                if ($trie_array[$enc_char] != $this->end_marker) {
                    $trie_array = & $trie_array[$enc_char];
                }
            } else {
                return false;
            }
        }
        return $trie_array;
    }
    /**
     * Returns all the terms in the trie beneath the provided term prefix
     *
     * @param string $prefix of term to look up
     * @param int $max_results maximum number of strings to return
     * @return array $terms under $prefix
     */
    public function getValues($prefix, $max_results)
    {
        $trie_array = $this->exists($prefix);
        if (!$trie_array) {
            return false;
        }
        return $this->getValuesTrieArray($trie_array, $prefix, $max_results);
    }
    /**
     * Computes the suffixes $count,...$max_results-$count in the trie_array
     * beneath the provided $find_more is true. Prepends $prefix to each
     * and returns the array of the result.
     *
     * @param array $trie_array a nested array representing a trie to look
     *     up suffixes in
     * @param string $prefix to prepend to each found suffix
     * @param int $max_results maximum number of strings to return
     * @param int $count which suffix in trie_array to start with
     * @param bool $find_more whether to try to look up or not (stops recursion)
     * @return array $terms a list of ($prefix.suffix1, $prefix, $suffix2,...)
     */
    private function getValuesTrieArray($trie_array, $prefix, $max_results,
        &$count = 0, &$find_more = true)
    {
        $end_marker = $this->end_marker;
        $terms = [];
        if ($trie_array != null && $find_more) {
            foreach ($trie_array as $character => $subtrie) {
                if ($character != $end_marker) {
                    $new_terms =
                        $this->getValuesTrieArray($subtrie,
                            $prefix . urldecode($character),
                            $max_results, $count, $find_more);
                    $terms = array_merge($terms, $new_terms);
                } else {
                    $count++;
                    if ($count > $max_results) {
                        $find_more = false;
                    }
                    $terms[] = $prefix;
                }
            }
        }
        return $terms;
    }
}
ViewGit