Last commit for src/library/HashTable.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

/**
 *
 * Code used to manage a memory efficient hash table
 * Weights for the queue must be flaots
 *
 * @author Chris Pollett
 */
class HashTable extends StringArray
{
    /**
     * The size in bytes for keys stored in the hash table
     *
     * @var int
     */
    public $key_size;
    /**
     * The size in bytes of values associated with values
     *
     * @var int
     */
    public $value_size;
    /**
     * Holds an all \0 string used of length $this->key_size
     * @var string
     */
    public $null;
    /**
     * Holds \0\0 followed by an all \FF string of length $this->key_size -1
     * Used to indicate that a slot once held data but that data was deleted.
     * Such a slot tells a lookup to keep going, but on an insert can be
     * overwritten in the inserted key is not already in the table
     * @var string
     */
    public $deleted;
    /**
     * Number of items currently in the hash table
     * @var int
     */
    public $count;
    /**
     * Flag for hash table lookup methods
     */
    const ALWAYS_RETURN_PROBE = 1;
    /**
     * Flag for hash table lookup methods
     */
    const RETURN_PROBE_ON_KEY_FOUND = 0;
    /**
     * Flag for hash table lookup methods
     */
    const RETURN_VALUE = -1;
    /**
     * Flag for hash table lookup methods
     */
    const RETURN_BOTH = -2;
    /**
     * Makes a persistently stored (i.e., on disk and ram)  hash table using the
     * supplied parameters
     *
     * @param string $fname filename to use when storing the hash table to disk
     * @param int $num_values number of key value pairs the table can hold
     * @param int $key_size number of bytes to store a hash table key
     * @param int $value_size number of bytes to store a hash table value
     * @param int $save_frequency how many non read operation before saving to
     *     disk
     */
    public function __construct($fname, $num_values, $key_size, $value_size,
        $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
    {
        $this->key_size = $key_size;
        $this->value_size = $value_size;
        $this->null = pack("x". $this->key_size);
        $this->deleted = pack("H2x".($this->key_size - 1), "FF");
        $this->count = 0;
        parent::__construct($fname, $num_values,
            $key_size + $value_size, $save_frequency);
    }
    /**
     * Inserts the provided $key - $value pair into the hash table
     *
     * @param string $key the key to use for the insert (will be needed for
     *     lookup)
     * @param string $value the value associated with $key
     * @param int $probe if the location in the hash table is already known
     *     to be $probe then this variable can be used to save a lookup
     * @return bool whether the insert was successful or not
     */
    public function insert($key, $value, $probe = false)
    {
        $null = $this->null;
        $deleted = $this->deleted;

        if ($probe === false) {
            $probe = $this->lookup($key, self::ALWAYS_RETURN_PROBE);
        }
        if ($probe === false) {
            /* this is a little slow
               the idea is we can't use deleted slots until we are sure
               $key isn't in the table
             */
            $probe = $this->lookupArray(
                $key, [$null, $deleted], self::ALWAYS_RETURN_PROBE);
            if ($probe === false) {
                crawlLog("No space in hash table");
                return false;
            }
        }
        //there was a free slot so write entry...
        $data = pack("x". ($this->key_size + $this->value_size));
        if (strlen($value) < $this->value_size) {
            /* this case should not happen, rather
                give an error we null terminate the string to the desired
                length
             */
            $value = str_pad($value, $this->value_size, '\0');
        }
        //first the key
        for ($i = 0; $i < $this->key_size; $i++) {
            $data[$i] = $key[$i];
        }
        //then the value
        for ($i = 0; $i < $this->value_size; $i++) {
            $data[$i + $this->key_size] = $value[$i];
        }
        $this->put($probe, $data);
        $this->count++;
        $this->checkSave();
        return true;
    }
    /**
     * Tries to lookup the key in the hash table either return the
     * location where it was found or the value associated with the key.
     *
     * @param string $key key to look up in the hash table
     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
     *     self::RETURN_PROBE_ON_KEY_FOUND, self::RETURN_VALUE, or self::BOTH.
     *     Here value means the value associated with the key and probe is
     *     either the location in the array where the key was found or
     *     the first location in the array where it was determined the
     *     key could not be found.
     * @return mixed would be string if the value is being returned,
     *     an int if the probe is being returned, and false if the key
     *     is not found
     */
    public function lookup($key, $return_probe_value = self::RETURN_VALUE)
    {
        return $this->lookupArray(
            $key, [$this->null], $return_probe_value);
    }
    /**
     * Tries to lookup the key in the hash table either return the
     * location where it was found or the value associated with the key.
     * If the key is not at the initial probe value, linear search in the
     * table is done. The values which cut-off the search are stored in
     * $null_array. Using an array allows for flexibility since a deleted
     * entry needs to be handled different when doing a lookup then when
     * doing an insert.
     *
     * @param string $key key to look up in the hash table
     * @param array $null_array key values that would cut-off the search
     *     for key if the initial probe failed
     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
     *     self::RETURN_PROBE_ON_KEY_FOUND, or self::RETURN_VALUE. Here
     *     value means the value associated with the key and probe is
     *     either the location in the array where the key was found or
     *     the first location in the array where it was determined the
     *     key could not be found.
     * @return mixed would be string if the value is being returned,
     *     an int if the probe is being returned, and false if the key
     *     is not found
     */
    public function lookupArray($key, $null_array,
        $return_probe_value = self::RETURN_VALUE)
    {
        $index = $this->hash($key);
        $num_values = $this->num_values;
        $probe_array = [self::RETURN_PROBE_ON_KEY_FOUND,
            self::ALWAYS_RETURN_PROBE];
        for ($j = 0; $j < $num_values; $j++)  {
            $probe = ($index + $j) % $num_values;
            list($index_key, $index_value) = $this->getEntry($probe);
            if (in_array($index_key, $null_array)) {
                if ($return_probe_value == self::ALWAYS_RETURN_PROBE) {
                    return $probe;
                } else {
                    return false;
                }
            }

            if (strcmp($key, $index_key) == 0) { break; }
        }

        if ($j == $num_values) {return false;}

        $result = $index_value;
        if (in_array($return_probe_value, $probe_array)) {
            $result = $probe;
        }
        if ($return_probe_value == self::RETURN_BOTH) {
            $result = [$probe, $index_value];
        }
        return $result;
    }
    /**
     * Deletes the data associated with the provided key from the hash table
     *
     * @param string $key the key to delete the entry for
     * @param int $probe if the location in the hash table is already known
     *     to be $probe then this variable can be used to save a lookup
     * @return bool whether or not something was deleted
     */
    public function delete($key, $probe = false)
    {
        $deleted = pack("H2x".($this->key_size + $this->value_size - 1), "FF");
            //deletes
        if ($probe === false) {
            $probe = $this->lookup($key, self::RETURN_PROBE_ON_KEY_FOUND);
        }
        if ($probe === false) { return false; }
        $this->put($probe, $deleted);
        $this->count--;
        $this->checkSave();
        return true;
    }
    /**
     * Get the ith entry of the array for the hash table (no hashing here)
     *
     * @param int $i an index of the hash table array
     * @return array the key value pair stored at this index
     */
    public function getEntry($i)
    {
        $raw = $this->get($i);
        $key = substr($raw, 0, $this->key_size);
        $value = substr($raw, $this->key_size, $this->value_size);
        return [$key, $value];
    }
    /**
     * Hashes the provided key to an index in the array of the hash table
     *
     * @param string $key a key to hashed into the hash table
     * @return int an index in the array of the hash table
     */
    public function hash($key)
    {
        $tmp = md5($key, true);
        $pre_index = ((ord($tmp[0]) << 8) + ord($tmp[1]) << 8) + ord($tmp[2]);
        $index = floor($pre_index * $this->num_values/(2 << 23));
        return $index;
    }
    /**
     * Pretty prints the contents of the hash table viewed as an array.
     *
     */
    public function printContents()
    {
        for ($i = 1; $i <= $this->num_values; $i++) {
            $row = $this->getEntry($i);
            print "Entry: $i Key:".$row[0]." Value: ".$row[1]."\n";
        }
    }
}
ViewGit