Last commit for src/library/LSMTree.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
/**
 * This class implements a Log Structured merge tree data structure suitable for
 * storing and retrieving sortable records
 *
 * @author Chris Pollett
 */
class LSMTree
{
    /**
     *
     */
    const BLOCK_FACTOR = 10000;
    /**
     *
     */
    const RECORD_CACHE_SIZE = 100;
    /**
     *
     */
    const DATA_FILE_PREFIX = "D";
    /**
     *
     */
    const FOLDER_PREFIX = "F";
    /**
     *
     */
    const INDEX_FILE = "index.txt";
    /**
     *
     */
    const MAX_FILE_SIZE = 32768;
    /**
     *
     */
    const MAX_TIER_FILENAME = "max_tier.txt";
    /**
     *
     */
    const TIER_PREFIX = "Tier";
    /**
     * @var string
     */
    public $base_slot = "A";
    /**
     * @var string
     */
    public $block_factor;
    /**
     * Folder for storing the LSMTree files
     * @var string
     */
    public $folder;
    /**
     * @var int
     */
    public $max_file_size;
    /**
     * @var Tier
     */
    public $put_slot = null;
    /**
     *
     */
    public $table_tools;
    /**
     * The highest tiered index in the LSMTree
     * @var int
     */
    public static $max_tier;
    /**
     * Creates/Loads LSM-Tree having specified folder and minimum_degree.
     *
     * @param string $folder is the folder for storing the B+-Tree files
     * @param array $format the column names, keys and types for this
     *  LSTM  object
     * @param int $max_file_size
     */
    public function __construct($folder, $format, $max_file_size =
        self::MAX_FILE_SIZE, $block_factor = self::BLOCK_FACTOR)
    {
        if (empty($format['PRIMARY KEY'][1]) || $format['PRIMARY KEY'][1] < 0) {
            throw new \Exception(
                "LSMTree class requires fixed lengthed keys");
        }
        $this->folder = $folder;
        if (!file_exists($folder)) {
            mkdir($folder);
            chmod($folder, 0777);
        }
        $this->getMaxTier(); //computes max tier if not set yet
        $this->table_tools = new PackedTableTools($format);
        $this->max_file_size = $max_file_size;
        $this->block_factor = $block_factor;
    }
    /**
     *
     */
    public function getTierFolder($tier)
    {
        return $this->folder . "/" . self::TIER_PREFIX .
            sprintf("%'.04d", $tier);
    }
    /**
     *
     */
    public function occupiedTier($tier)
    {
        return file_exists($this->getTierFolder($tier) . "/A");
    }
    /**
     *
     */
    public function getMaxTier($recompute = false)
    {
        if (!$recompute && isset(self::$max_tier)) {
            return self::$max_tier;
        }
        $max_path = $this->folder . "/" . self::MAX_TIER_FILENAME;
        if (!$recompute && file_exists($max_path)) {
            return self::$max_tier = intval(file_get_contents($max_path));
        }
        $tier_path = $this->folder . "/" . self::TIER_PREFIX;
        $tiers = glob($tier_path . "*");
        sort($tiers);
        $last_tier = (!empty($tiers) && count($tiers) > 0) ?
            $tiers[count($tiers) - 1] : $this->folder . "/" .
            self::TIER_PREFIX . "0";
        $max_tier = intval(substr($last_tier,
            strlen($tier_path)));
        file_put_contents($max_path, $max_tier);
        return self::$max_tier = $max_tier;
    }
    /**
     *
     */
    public function selectPutSlot($letter)
    {
        $this->base_slot = ($letter == "A") ? "A" : "B";
        $this->put_slot = null;
    }
    /**
     *
     */
    public function put($row)
    {
        if (empty($this->put_slot)) {
            $put_folder = $this->getTierFolder(0) . "/" . $this->base_slot;
            $this->put_slot = new Tier($put_folder, $this->table_tools, "w",
                $this->max_file_size, $this->block_factor);
        }
        $this->put_slot->put($row , false);
    }
    /**
     * Flushes any in-memory data into the active slot at tier 0.
     */
    public function flush()
    {
        if (!empty($this->put_slot)) {
            $this->put_slot->flush();
        }
    }
    /**
     * Merges any tiers with both slots filled in the LSTM into
     * a tier in a slot one level higher
     */
    public function mergeTiers()
    {
        crawlLog("..Begin LSMTiers Merging Tiers..");
        $max_tier = $this->getMaxTier();
        crawlLog("....begin Max Tier is: $max_tier");
        for ($i = 0; $i <= $max_tier; $i++)
        {
            $tier_folder = $this->getTierFolder($i);
            if (file_exists($tier_folder . "/B")) {
                crawlLog("....Merging Tier $i");
                $this->mergeTier($i);
            } else {
                crawlLog("....Tier $i doesn't need to be merged.");
            }
        }
        $max_tier = $this->getMaxTier(true);
        crawlLog("....end Max Tier is: $max_tier");
        crawlLog("..End LSMTiers Merging Tiers..");
    }
    /**
     *
     */
    public function emptyTier($tier)
    {
        $tier_folder = $this->getTierFolder($tier);
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $db = new $db_class();
        $db->unlinkRecursive($tier_folder, true);
    }
    /**
     *
     */
    public function compare($entry_a, $entry_b)
    {
        $key_len = $this->table_tools->key_len;
        return strncmp($entry_a, $entry_b, $key_len);
    }
    /**
     *
     */
    public function mergeEntries($entry_a, $entry_b)
    {
        $table_tools = $this->table_tools;
        list($encoded_key, $values_a) =
            entryToKeyValues($entry_a, $table_tools);
        list(, $values_b)  = entryToKeyValues($entry_b, $table_tools);
        $out_values = $table_tools->mergeRowValues($values_a, $values_b);
        return $encoded_key . $out_values;
    }
    /**
     *
     */
    public function mergeTier($tier)
    {
        $tier_folder = $this->getTierFolder($tier);
        if (!file_exists($tier_folder . "/B")) {
            return;
        }
        if (!file_exists($tier_folder . "/A")) {
            rename($tier_folder . "/B", $tier_folder . "/A");
            return;
        }
        $a_tier = new Tier($tier_folder . "/A", $this->table_tools);
        $b_tier = new Tier($tier_folder . "/B", $this->table_tools);
        $next_tier_folder = $this->getTierFolder($tier + 1);
        $target_folder = ($this->occupiedTier($tier + 1)) ?
            $next_tier_folder . "/B" : $next_tier_folder . "/A";
        $target_tier = new Tier($target_folder, $this->table_tools, "w",
            $this->max_file_size, $this->block_factor);
        $a_row = $a_tier->next();
        $b_row = $b_tier->next();
        $cnt = 0;
        while(!empty($a_row) || !empty($b_row)) {
            if (empty($b_row) && !empty($a_row)) {
                $target_tier->put($a_row);
                $a_row = $a_tier->next();
            } else if (empty($a_row) && !empty($b_row)) {
                $target_tier->put($b_row);
                $b_row = $b_tier->next();
            } else if (!empty($a_row) && !empty($b_row)) {
                $cmp = $this->compare($a_row, $b_row);
                if ($cmp < 0) {
                    $target_tier->put($a_row);
                    $a_row = $a_tier->next();
                } else if ($cmp > 0) {
                    $target_tier->put($b_row);
                    $b_row = $b_tier->next();
                } else {
                    $target_tier->put($this->mergeEntries($a_row, $b_row));
                    $a_row = $a_tier->next();
                    $b_row = $b_tier->next();
                }
            }
            $cnt++;
            crawlTimeoutLog("......have merged $cnt items of Tier $tier.");
        }
        $target_tier->flush();
        $this->emptyTier($tier);
    }
    /**
     * Returns the record associated with a $key as stored in the LSMTree.
     * If $key is not present in tree then returns null
     * @param string $key to look up in current LSMTree
     * @param int $offset starting record item associated with this key
     *      to return
     * @param int $limit maximum number of record items associated with this
     *      key to return, null == till end
     * @return array of items associated with this key in the LSMTree
     */
    public function get($key, $offset = 0, $limit = null)
    {
        $max_tier = $this->getMaxTier();
        $rows = [];
        $max_rows = $offset + ($limit ?? 0);
        for ($i = $max_tier; $i >= 0; $i--)
        {
            $add_rows = $this->getTier($i, $key);
            if (is_array($add_rows)) {
                $rows += $add_rows;
            }
            if ($limit > 0 && count($rows) > $max_rows) {
                break;
            }
        }
        if ($offset > 0 || $limit > 0) {
            $rows = array_slice($rows, $offset, $limit);
        }
        return $rows;
    }
    /**
     *
     */
    public function getTier($tier, $key)
    {
        $slot_folder = $this->getTierFolder($tier) . "/A";
        if (!file_exists($slot_folder)) {
            return null;
        }
        $slot = new Tier($slot_folder, $this->table_tools);
        return $slot->get($key);
    }
    /**
     * Save the operating parameters of this LSMTree
     */
    public function saveParameters()
    {
        $parameter_path = $this->folder . "/" . self::PARAMETERS_FILE;
        file_put_contents($parameter_path, serialize($this->parameters),
            LOCK_EX);
    }
    /**
     * Returns the parameters (such as its signature, max keys per nodes, etc)
     * used to configure the LSMTree stored at $folder
     *
     * @param string $folder file path to a stored LSMTree
     * @return array configuration info about the LSMTree
     */
    public static function getParameterInfo($folder)
    {
        $parameter_path = $folder . "/" . self::PARAMETERS_FILE;
        if(file_exists($parameter_path)) {
            $parameters = unserialize(file_get_contents($parameter_path)) ?? [];
            return $parameters;
        } else {
            return [];
        }
    }
}
/**
 *
 */
function entryToKeyValues($entry, $table_tools, $decode_key = false)
{
    $key_len = $table_tools->key_len;
    $key = substr($entry, 0, $key_len);
    $values = substr($entry, $key_len);
    return [$key, $values];
}
/**
 *
 */
class Tier
{
    /**
     *
     */
    public $block_factor;
    /**
     *
     */
    public $first_active_key;
    /**
     *
     */
    public $folder;
    /**
     *
     */
    public $iterator_folder_index;
    /**
     *
     */
    public $iterator_folders;
    /**
     *
     */
    public $iterator_file_index;
    /**
     *
     */
    public $iterator_files;
    /**
     *
     */
    public $iterator_entry_index;
    /**
     *
     */
    public $iterator_entries;
    /**
     *
     */
    public $max_file_size;
    /**
     *
     */
    public $mode;
    /**
     *
     */
    public $table_tools;
    /**
     *
     */
    private $records;
    /**
     *
     */
    private $active_filename;
    /**
     *
     */
    private static $cache = [];
    /**
     *
     */
    public function __construct($folder, $table_tools, $mode = "r",
        $max_file_size = LSMTree::MAX_FILE_SIZE,
        $block_factor = LSMTree::BLOCK_FACTOR)
    {
        $this->folder = $folder;
        $this->table_tools = $table_tools;
        $this->mode = $mode;
        $this->max_file_size = $max_file_size;
        $this->block_factor = $block_factor;
        $folder_exists = file_exists($folder);
        if (!$folder_exists) {
            if ($mode == "w") {
                makePath($folder);
            } else {
                return null;
            }
        }
        if ($mode == "w") {
            $this->records = "";
        }
        $this->reset();
    }
    /**
     *
     */
    public function flush()
    {
        if ($this->mode != "w" || empty($this->first_active_key)) {
            return false;
        }
        if (empty($this->records)) {
            return true;
        }
        $old_cwd = getcwd();
        chdir($this->folder);
        $tier_folders = glob(LSMTree::FOLDER_PREFIX . "*");
        $tiers_changed = false;
        if (empty($tier_folders)) {
            $active_folder = LSMTree::FOLDER_PREFIX .
                rawurlencode($this->first_active_key);
            mkdir($active_folder);
            chmod($active_folder, 0777);
            $tier_folders = [$active_folder];
            $tiers_changed = true;
            $data_files = [];
        } else {
            $last_folder = $tier_folders[count($tier_folders) - 1];
            chdir($last_folder);
            $data_files = glob(LSMTree::DATA_FILE_PREFIX . "*");
            if (empty($data_files)) {
                $data_files = [];
            }
            chdir($this->folder);
            $num_data_files = count($data_files);
            if ($num_data_files >= $this->block_factor) {
                $active_folder = LSMTree::FOLDER_PREFIX .
                    rawurlencode($this->first_active_key);
                mkdir($active_folder);
                $data_files = [];
                $tier_folders[] = $active_folder;
                $tiers_changed = true;
            } else {
                $active_folder = $last_folder;
            }
        }
        chdir($old_cwd);
        $active_path = $this->folder . "/" . $active_folder;
        $data_file = LSMTree::DATA_FILE_PREFIX . rawurlencode(
            $this->first_active_key);
        $data_files[] = $data_file;
        file_put_contents("$active_path/$data_file", $this->records);
        $this->writeRecords("$active_path/" . LSMTree::INDEX_FILE, $data_files);
        if ($tiers_changed) {
            $this->writeRecords($this->folder . "/" . LSMTree::INDEX_FILE,
                $tier_folders);
        }
        $this->records = "";
        return true;
    }
    /**
     *
     */
    public function get($key)
    {
        $tier_folders = $this->readRecords(
            $this->folder . "/" . LSMTree::INDEX_FILE);
        $url_encoded_key = rawurlencode($key);
        $key_folder = $this->binarySearch(LSMTree::FOLDER_PREFIX .
            $url_encoded_key, $tier_folders);
        if (!$key_folder) {
            return false;
        }
        $key_path = $this->folder . "/$key_folder";
        $data_files = $this->readRecords("$key_path/" . LSMTree::INDEX_FILE);
        $data_file = $this->binarySearch(LSMTree::DATA_FILE_PREFIX .
            $url_encoded_key, $data_files);
        if (!$data_file) {
            return false;
        }
        $table_tools = $this->table_tools;
        $key_len = $table_tools->key_len;
        $record_string = file_get_contents("$key_path/$data_file");
        $start = strpos($record_string, "\xFF$key");
        if ($start) {
            $start++;
        } else {
            if (strncmp($record_string, $key, $key_len) != 0) {
                return false;
            }
            $start = 0;
        }
        $end = strpos($record_string, "\xFF", $start);
        $end = ($end > 0) ? $end : strlen($record_string);
        $length = $end - $start;
        $record = decode255(substr($record_string, $start, $end));
        $values = substr($record, $key_len);
        return $table_tools->unpack($values);
    }
    /**
     *
     */
    public function put($row, $is_packed = true)
    {
        if($this->mode != "w") {
            return false;
        }
        $table_tools = $this->table_tools;
        if (!$is_packed) {
            $key = $row[$table_tools->key_field];
            $packed_row = $table_tools->pack($row);
            $entry = $key . $packed_row;
        } else {
            $entry = $row;
        }
        $encoded_entry = encode255($entry);
        if (strlen($this->records) + strlen($encoded_entry) + 1 >
            $this->max_file_size) {
            $this->flush();
        }
        if (empty($this->records)) {
            list($this->first_active_key,) = entryToKeyValues($entry,
                $table_tools, true);
        }
        $separator = (strlen($this->records) > 0) ? "\xFF" : "";
        $this->records .= $separator . $encoded_entry;
        return true;
    }
    /**
     *
     */
    public function binarySearch($needle, $haystack)
    {
        $low = 0;
        $high = count($haystack) - 1;
        if ($high < 0 || strcmp($needle, $haystack[$low]) < 0) {
            return false;
        }
        if (strcmp($needle, $haystack[$high]) >= 0) {
            return $haystack[$high];
        }
        while ($high - $low > 1) {
            $mid = ($high + $low) >> 1;
            $cmp = strcmp($needle, $haystack[$mid]);
            if ($cmp == 0) {
                return $haystack[$mid];
            } else if ($cmp < 0) {
                $high = $mid;
            } else {
                $low = $mid;
            }
        }
        return $haystack[$low];
    }
    /**
     *
     */
    public function firstEntry()
    {
        $folder = $this->folder;
        $this->iterator_folders = $this->readRecords("$folder/" .
            LSMTree::INDEX_FILE);
        if (empty($this->iterator_folders) ||
            !is_array($this->iterator_folders)) {
            return false;
        }
        $this->iterator_folder_index = 0;
        $iterator_folder = $this->iterator_folders[0];
        $file_path = "$folder/$iterator_folder";
        $this->iterator_files = $this->readRecords( "$file_path/" .
            LSMTree::INDEX_FILE);
        if (empty($this->iterator_files) ||
            !is_array($this->iterator_files)) {
            return false;
        }
        $this->iterator_file_index = 0;
        $iterator_file = $this->iterator_files[0];
        $this->iterator_entries = $this->readRecords(
            "$file_path/{$iterator_file}", "\xFF");
        if (empty($this->iterator_entries) ||
            !is_array($this->iterator_entries)) {
            return false;
        }
        $this->iterator_entry_index = 0;
        return decode255($this->iterator_entries[0]);
    }
    /**
     *
     */
    public function next()
    {
        if (empty($this->iterator_folders)) {
            return $this->firstEntry();
        }
        $this->iterator_entry_index++;
        $entry = $this->iterator_entries[$this->iterator_entry_index] ?? false;
        if ($entry) {
            return decode255($entry);
        }
        $folder = $this->folder;
        $this->iterator_entry_index = 0;
        $this->iterator_file_index++;
        $file = $this->iterator_files[$this->iterator_file_index] ?? false;
        if ($file) {
            $iterator_folder = $this->iterator_folders[
                $this->iterator_folder_index];
            $file_path = "$folder/$iterator_folder/$file";
            $this->iterator_entries = $this->readRecords($file_path, "\xFF");
            return decode255($this->iterator_entries[0]) ?? false;
        }
        $this->iterator_file_index = 0;
        $this->iterator_folder_index++;
        $iterator_folder = $this->iterator_folders[
            $this->iterator_folder_index] ?? false;
        if (!$iterator_folder) {
            return false;
        }
        $folder_path = "$folder/$iterator_folder";
        $this->iterator_files = $this->readRecords("$folder_path/" .
            LSMTree::INDEX_FILE);
        if (empty($this->iterator_files) ||
            !is_array($this->iterator_files)) {
            return false;
        }
        $file = $this->iterator_files[0];
        $file_path = "$folder_path/$file";
        $this->iterator_entries = $this->readRecords($file_path, "\xFF");
        return decode255($this->iterator_entries[0]) ?? false;
    }
    /**
     *
     */
    public function reset()
    {
        $this->iterator_folder_index = 0;
        $this->iterator_folders = [];
        $this->iterator_file_index = 0;
        $this->iterator_files = [];
        $this->iterator_entry_index = 0;
        $this->iterator_entries = [];
    }
    /**
     *
     */
    function writeRecords($filename, $lines, $delimiter = "\n")
    {
        file_put_contents($filename, implode($delimiter, $lines));
        $name_hash = crawlHash($filename);
        unset(self::$cache[$name_hash]);
    }
    /**
     *
     */
    function readRecords($filename, $delimiter = "\n")
    {
        $name_hash = crawlHash($filename);
        if (isset(self::$cache[$name_hash])) {
            $tmp = self::$cache[$name_hash];
            unset(self::$cache[$name_hash]);
            self::$cache[$name_hash] = $tmp; //move to end of array
            return $tmp;
        }
        self::$cache[$name_hash] =
            explode($delimiter, file_get_contents($filename));
        if (count(self::$cache[$name_hash]) >= LSMTRee::RECORD_CACHE_SIZE) {
            array_shift(self::$cache);
        }
        return self::$cache[$name_hash];
    }
}
ViewGit