Last commit for src/library/BPlusTree.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
/**
 * This class implements the B+-tree structure over existing file system
 *
 * @author Chris Pollett
 */
class BPlusTree
{
    /**
     * The default compressor object used to compress tree nodes
     */
    const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
    /**
     * The MAXIMUM number of keys stored in a tree node (will split if have
     * more)
     */
    const MAX_KEYS = 501;
    /**
     * Maximum size of a cache of any type used by this BPlusTree
     */
    const MAX_CACHE_SIZE = 200;
    /**
     * Default parameters to use when constructing a BPlusTree
     */
    const DEFAULT_PARAMETERS = ["NODE_COMPRESSOR" => self::DEFAULT_COMPRESSOR,
        "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR,
        "FORMAT" => ["PRIMARY KEY" => ["KEY", -1], "VALUE" => "BLOB"],
        "MAX_KEYS" => self::MAX_KEYS
    ];
    /**
     * Internal nodes of BPlusTree are folders, subfolders/subfiles are
     * names according to their least key except for the first subdfolder/
     * subfile of the node which is given the name of the LEAST_NODE_NAME
     * constant
     */
    const LEAST_NODE_NAME = "start";
    /**
     * Internal nodes of BPlusTree are folders. For nodes of the same
     * height in the tree NEXT_NODE_NAME is used as the name of the file
     * with the serialized name of the next folder of the same height in
     * the tree.
     */
    const NEXT_NODE_NAME = "next";
    /**
     * Name of temporary file used when splitting a BPlusTree node.
     */
    const TEMP_NODE_NAME = "tmp_node";
    /**
     * Leaf node archive file (used for blob and serial record columns) name
     * prefix
     */
    const ARCHIVE_PREFIX = "a";
    /**
     * Leaf node index record file name prefix
     */
    const NODE_PREFIX = "i";
    /**
     * File name of file used to store the parameters of this
     * BPlusTree
     */
    const PARAMETERS_FILE = "bpt_parameters.txt";
    /**
     * Field variable used as a cache for the file handle, file name, and
     * time of the last archive file for a B+-tree node added to
     * @var array
     */
    public $add_archive_cache = [null, "", -1];
    /**
     * Field variable used as a cache for the file handle, file name, and
     * time of the last archive file for a B+-tree node accessed for a value
     * @var array
     */
    public $get_archive_cache = [null, "", -1];
    /**
     * Array of column names for the columns in a BPlusTree which
     * are of type BLOB or SERIAL
     * @var array
     */
    public $blob_columns;
    /**
     * The seekquarry\yioop\library\compressors\Compressor object used to
     * compress node files.
     * @var object
     */
    public $node_compressor;
    /**
     * The seekquarry\yioop\library\compressors\Compressor object used to
     * compress blob columns.
     * @var object
     */
    public $blob_compressor;
    /**
     * Folder for storing the B-Tree files
     * @var string
     */
    public $folder;
    /**
     * A cache for contents of nodes that have been recently added to.
     * consists of path_to_node => node_content pairs
     * @var array
     */
    public $insert_node_cache = [];
    /**
     * Used to keep track of when this instance was created, as part of managing
     * file handles expiration (could be set/updated externally to reflect
     * some other instance using the BPlusTree)
     * @var int
     */
    public $instance_time;
    /**
     * Name of primary key column for records
     * @var string
     */
    public $key_field;
    /**
     * Last folder path of a find operation, provided this was cacheable
     * @var string
     */
    public $last_find_folder = null;
    /**
     * Last encoded key used for a find operation, provided this was cacheable
     * Used to avoid recomputing path down tree if will be the same.
     * @var string
     */
    public $last_find_key = null;
    /**
     * First key of next node after returned node for the last find operation,
     * provided this was cacheable
     * @var string
     */
    public $last_find_next_key = null;
    /**
     * Storage for root node of the B-Tree
     * @var object
     */
    public $parameters;
    /**
     * Array of column names for the columns in a PartitionDocumentBundle which
     * are of type SERIAL
     * @var array
     */
    public $serial_columns;
    /**
     * The PackedTableTools object used to pack and unpack records in
     * BPlusTree
     * @var object
     */
    public $table_tools;
    /**
     * Cache of key-values (internal_node_path => array of subnodes) used to
     * speed up tree traversal
     * @var array
     */
    public $tree_path_cache = [];
    /**
     * Creates/Loads B+-Tree having specified folder and minimum_degree.
     *
     * @param string $folder is the folder for storing the B+-Tree files
     * @param array $format the column names, keys and types for this
     *  B+-Tree  object
     * @param int $max_keys the maximum number of keys a node is allowed to hold
     * @param object $node_compressor_type
     *  seekquarry\yioop\library\compressors\Compressor object used to
     *  compress index node files.
     * @param object $blob_compressor_type
     *  seekquarry\yioop\library\compressors\Compressor object used to
     *  compress blob columns.
     */
    public function __construct($folder, $format =
        self::DEFAULT_PARAMETERS["FORMAT"], $max_keys = self::MAX_KEYS,
        $node_compressor_type = self::DEFAULT_COMPRESSOR,
        $blob_compressor_type = self::DEFAULT_COMPRESSOR)
    {
        // ensure $max_keys odd
        $max_keys = ($max_keys % 2 == 1) ? $max_keys : $max_keys + 1;
        $initial_parameters = self::DEFAULT_PARAMETERS;
        $initial_parameters["MAX_KEYS"] = $max_keys;
        $initial_parameters["NODE_COMPRESSOR"] = $node_compressor_type;
        $initial_parameters["BLOB_COMPRESSOR"] = $blob_compressor_type;
        $this->node_compressor = new $node_compressor_type();
        $this->blob_compressor = new $blob_compressor_type();
        $initial_parameters["FORMAT"] = $format;
        $this->instance_time = hrtime(true);
        $this->folder = $folder;
        $changed_parameters = false;
        if (!file_exists($folder)) {
            $changed_parameters = true;
            if(!mkdir($folder)) {
                return null;
            }
            file_put_contents($folder . "/" . self::NEXT_NODE_NAME,
                serialize(null));
        }
        $this->parameters = self::getParameterInfo($folder);
        foreach (self::DEFAULT_PARAMETERS as $field => $value) {
            if (!isset($this->parameters[$field])) {
                $this->parameters[$field] = $initial_parameters[$field];
                $changed_parameters = true;
            }
        }
        $format = $this->parameters["FORMAT"];
        $packed_table_format = [];
        $this->blob_columns = [];
        $this->serial_columns = [];
        $this->key_field = null;
        foreach ($format as $field_name => $type) {
            $upper_field_name = strtoupper($field_name);
            if ($upper_field_name == "PRIMARY KEY") {
                $packed_table_format["PRIMARY KEY"] = $type;
                $this->key_field = (is_array($type)) ? $type[0] : $type;
                continue;
            }
            $type = strtoupper($type);
            if (in_array($type, ["BLOB", "SERIAL"])) {
                $this->blob_columns[] = $field_name;
                if ($type == "SERIAL") {
                    $this->serial_columns[] = $field_name;
                }
                $packed_table_format[$field_name] = "INT";
            } else if (isset(PackedTableTools::TYPE_SYNONYMS[$type])) {
                $packed_table_format[$field_name] =
                    PackedTableTools::TYPE_SYNONYMS[$type];
            } else {
                return null;
            }
        }
        if (empty($this->key_field)) {
            return null;
        }
        if (!empty($this->blob_columns)) {
            $packed_table_format["LAST_BLOB_LEN"] = "INT";
        }
        $this->table_tools = new PackedTableTools($packed_table_format,
            $node_compressor_type);
        if ($changed_parameters) {
            $this->saveParameters();
        }
    }
    /**
     * Adds an entry to the BPlusTree. Entries added might not be fully
     * on disk until flushLastPutNode is called, as this memory tries to
     * avoid disk writes.
     *
     * @param array $row associative array of filed_name => values for each
     *  column field field this BPlusTree's signature has
     * @param bool $is_encoded_key whether in $row the primary key field has
     *  already been encoded (rawurlencode) or not
     * @param int $mode says what to do in the event that an entry with the
     *  given key already exists in the BPlusTree. Possibilities are either
     *  PackedTableTools::APPEND_MODE or PackedTableTools::REPLACE_MODE
     *  The former appends as a subrecord the nonkey fields of the entry
     *  to the existing list of sub-records associated to the given key, the
     *  later replaces the key value pair.
     * @return bool success (true) or failure (false)
     */
    public function put($row, $is_encoded_key = true,
        $mode = PackedTableTools::APPEND_MODE)
    {
        $key = $row[$this->key_field] ?? false;
        if ($key === false) {
            return false;
        }
        $encode_key = ($is_encoded_key) ? $key : rawurlencode($key);
        $insert_node_path = $this->find($encode_key, true);
        if(empty($insert_node_path)) {
            return false;
        }
        $table_tools = $this->table_tools;
        $this->flushLastPutNode($insert_node_path);
        if (!isset($this->insert_node_cache[$insert_node_path])) {
            $this->insert_node_cache[$insert_node_path] =
                $table_tools->load($insert_node_path, $mode) ?? [];
        }
        $insert_node = $this->insert_node_cache[$insert_node_path];
        $archive_filename = (empty($this->blob_columns)) ? "" :
            $this->archiveFilenameFromNodeFilename($insert_node_path);
        $this->putNode($row, $insert_node, $archive_filename, $is_encoded_key,
            $mode);
        if (count($insert_node) > $this->parameters["MAX_KEYS"]) {
            $this->flushLastPutNode();
            $this->splitRecordsInLeaf($insert_node_path, $insert_node);
            $this->insert_node_cache = [];
            $this->tree_path_cache = [];
            $parent_path = $this->getParentFolder($insert_node_path);
            $this->updateNodePath($parent_path);
            return true;
        }
        if (count($this->insert_node_cache) >= self::MAX_CACHE_SIZE) {
            $this->insert_node_cache = [];
        }
        $this->insert_node_cache[$insert_node_path] = $insert_node;
        $this->last_insert_node_path = $insert_node_path;
        return true;
    }
    /**
     * Saves the the BPlusTree $insert_node_path (if not empty) node to disk.
     * If $insert_node_path == "" then the last node put'ted to will be saved.
     *
     * @param string $insert_node_path name of node to save to disk.
     */
    public function flushLastPutNode($insert_node_path = "")
    {
        $last_insert_node_path = $this->last_insert_node_path ?? "";
        if (!empty($last_insert_node_path) &&
            $insert_node_path != $last_insert_node_path &&
            isset($this->insert_node_cache[$last_insert_node_path])) {
            $this->table_tools->save($last_insert_node_path,
                $this->insert_node_cache[$last_insert_node_path]);
            unset($this->insert_node_cache[$last_insert_node_path]);
            $this->last_insert_node_path = "";
        }
    }
    /**
     * Given an entry to add to a BPlusTree node, the index file of the node
     * (for non blob or serial fields), and the name of the archive file for
     * node (for blob or serial fields) stores the entry to the node.
     *
     * @param array $row associative array of filed_name => values for each
     *  column field field this BPlusTree's signature has
     * @param bool $is_encoded_key whether in $row the primary key field has
     *  already been encoded (rawurlencode) or not
     * @param array &$node an in-memory code of a node (will be modified by
     *  this method)
     * @param string $archive_filename the name of the archive file associated
     *  with this file used for blob or serial fields.
     * @param bool $is_encoded_key whether in $row the primary key field has
     *  already been encoded (rawurlencode) or not
     * @param int $mode says what to do in the event that an entry with the
     *  given key already exists in the BPlusTree. Possibilities are either
     *  PackedTableTools::APPEND_MODE or PackedTableTools::REPLACE_MODE
     *  The former appends as a subrecord the nonkey fields of the entry
     *  to the existing list of sub-records associated to the given key, the
     *  later replaces the key value pair.
     */
    public function putNode($row, &$node, $archive_filename,
        $is_encoded_key = true, $mode = PackedTableTools::APPEND_MODE)
    {
        $table_tools = $this->table_tools;
        $key = $row[$this->key_field] ?? false;
        $encode_key = ($is_encoded_key) ? $key : rawurlencode($key);
        $out_value = $this->writeBlobsAndPack($archive_filename, $row);
        $table_tools->add($node, $encode_key, $out_value,
            PackedTableTools::ADD_MEM_TABLE, $mode);
    }
    /**
     * For a non-leaf node checks if it needs to be split, and if so splits
     * and fixes keys up the tree to restore BPlus-Tree properties.
     *
     * @param string $folder to check if needs split
     * @return bool success (true) or failure (false)
     */
    public function updateNodePath($folder)
    {
        if (strncmp($folder, $this->folder, strlen($this->folder)) != 0) {
            return false;
        }
        $node_prefix = self::NODE_PREFIX;
        $nodes = glob("$folder/$node_prefix*");
        if (count($nodes) < $this->parameters["MAX_KEYS"]) {
            return true;
        }
        if ($folder == $this->folder) { //root case
            return $this->splitRootNode();
        }
        $len = strlen($folder);
        $num_nodes = count($nodes);
        $next_node_name = self::NEXT_NODE_NAME;
        $this->add_archive_cache = [null, "", -1];
        $this->get_archive_cache = [null, "", -1];
        $parent_folder = $this->getParentFolder($folder);
        $half_max = floor($num_nodes/2);
        $node_name = substr($nodes[$half_max], $len + 1);
        $new_node = "$parent_folder/$node_name";
        mkdir($new_node);
        $next_node = "$folder/$next_node_name";
        rename($next_node, "$new_node/$next_node_name");
        file_put_contents($next_node, serialize($node_name));
        $node = $nodes[$half_max];
        $node_name = substr($node, $len + 1);
        $archive_prefix = self::ARCHIVE_PREFIX;
        $least_node_name = self::LEAST_NODE_NAME;
        $len_node_prefix = strlen($node_prefix);
        $archive_name = $archive_prefix .
            substr($node_name, $len_node_prefix);
        rename($node, "$new_node/$least_node_name");
        $archive_filename = "$folder/$archive_name";
        if (file_exists($archive_filename)) {
            rename($archive_filename,
                "$new_node/$archive_prefix$least_node_name");
        }
        for ($i = $half_max + 1; $i < $num_nodes; $i++) {
            $node = $nodes[$i];
            $node_name = substr($node, $len + 1);
            rename($node, "$new_node/$node_name");
            $archive_name = $archive_prefix .
                substr($node_name, $len_node_prefix);
            $archive_filename = "$folder/$archive_name";
            if (file_exists($archive_filename)) {
                rename($archive_filename, "$new_node/$archive_name");
            }
        }
        return $this->updateNodePath($parent_folder);
    }
    /**
     * This method handles split the root node of a BPlusTree when it gets too
     * full. Unlike for other nodes when we can add a key in the parent, for the
     * root, we make a temporary folder and move all the root node's nodes to it
     * then rename this the the least node of the root, and call updateNodePath
     * on this to restore the BPlusTree property.
     *
     * @return @return bool success (true) or failure (false)
     */
    public function splitRootNode()
    {
        $this->last_find_folder = null;
        $this->last_find_key = null;
        $this->last_find_next_key = null;
        $folder = $this->folder;
        $this->add_archive_cache = [null, "", -1];
        $this->get_archive_cache = [null, "", -1];
        $len = strlen($folder);
        $node_prefix =  self::NODE_PREFIX;
        $temp_node_name = self::TEMP_NODE_NAME;
        $least_node_name = self::LEAST_NODE_NAME;
        $archive_prefix = self::ARCHIVE_PREFIX;
        $next_node_name = self::NEXT_NODE_NAME;
        $nodes = glob("$folder/$node_prefix*");
        $tmp_folder = "$folder/$temp_node_name";
        $least_node = "$folder/$least_node_name";
        $least_archive = "$folder/$archive_prefix$least_node_name";
        mkdir($tmp_folder);
        $next_node_name_path = "$folder/$next_node_name";
        if (!file_exists($next_node_name_path)) {
            file_put_contents($next_node_name_path, serialize(null));
        }
        $tmp_next_node_name_path = "$tmp_folder/$next_node_name";
        rename($next_node_name_path, $tmp_next_node_name_path);
        if (file_exists($least_node)) {
            rename($least_node, "$tmp_folder/$least_node_name");
            if (file_exists($least_archive)) {
                rename($least_archive,
                    "$tmp_folder/$archive_prefix$least_node_name");
            }
        }
        $len_node_prefix = strlen($node_prefix);
        foreach ($nodes as $node) {
            $node_name = substr($node, $len + 1);
            rename($node, "$tmp_folder/$node_name");
            $archive_name = $archive_prefix .
                substr($node_name, $len_node_prefix);
            $archive_filename = "$folder/$archive_name";
            if (file_exists($archive_filename)) {
                rename($archive_filename, "$tmp_folder/$archive_name");
            }
        }
        rename($tmp_folder, $least_node);
        return $this->updateNodePath($least_node);
    }
    /**
     * Splits BPlusTree leaf $node with path $node_path into two nodes each
     * with an equal number of keys.
     *
     * @param string $node_path path to file used to store a BPlusTree node
     *  Used to name files after split.
     * @param array BPlusTree node (associative array of pairs key =>
     *  record for key stored according to BPlusTree signature using
     *  PackedTableTools)
     */
    public function splitRecordsInLeaf($node_path, $node)
    {
        $this->last_find_folder = null;
        $this->last_find_key = null;
        $this->last_find_next_key = null;
        $parent_folder = $this->getParentFolder($node_path);
        $archive_prefix = self::ARCHIVE_PREFIX;
        $temp_node_name = self::TEMP_NODE_NAME;
        $node_prefix = self::NODE_PREFIX;
        $tmp_filename = "$parent_folder/$temp_node_name";
        $tmp_archive_filename = "$parent_folder/$archive_prefix$temp_node_name";
        $archive_filename = (empty($this->blob_columns)) ? "" :
            $this->archiveFilenameFromNodeFilename($node_path);
        $num_keys = count($node);
        $half_num = ceil($num_keys/2);
        $keys = array_keys($node);
        sort($keys);
        $half_key = $keys[$half_num];
        /* the str_replaces are because I was getting hard to trace crashes
           concerning nulls in file name.
         */
        $new_leaf_filename = str_replace("\x00", "0",
            "$parent_folder/$node_prefix$half_key");
        $new_archive_filename = str_replace("\x00", "0",
            "$parent_folder/$archive_prefix$half_key");
        $table_tools = $this->table_tools;
        $out_archive_filename = $tmp_archive_filename;
        $out_node = [];
        $flag = false;
        for ($i = 0; $i < $half_num; $i++) {
            $values = $this->getFromNode($keys[$i], $node, $archive_filename);
            if ($values) {
                foreach ($values as $value) {
                    $this->putNode($value, $out_node, $out_archive_filename);
                }
            }
        }
        $table_tools->save($tmp_filename, $out_node);
        $out_node = [];
        $out_archive_filename = $new_archive_filename;
        for ($i = $half_num; $i < $num_keys; $i++) {
            $values = $this->getFromNode($keys[$i], $node, $archive_filename);
            if ($values) {
                foreach ($values as $value) {
                    $this->putNode($value, $out_node, $out_archive_filename);
                }
            }
        }
        $table_tools->save($new_leaf_filename, $out_node);
        $this->add_archive_cache = [null, "", -1];
        $this->get_archive_cache = [null, "", -1];
        rename($tmp_filename, $node_path);
        if (!empty($this->blob_columns)) {
            rename($tmp_archive_filename, $archive_filename);
        }
    }
    /**
     * Returns the record associated with a $key as stored in the BPlusTree.
     * If $key is not present in tree then returns null
     *
     * @param string $key to look up in current BPlusTree
     * @param bool $is_encoded_key whether $key has been rawurlencode'd or not
     * @param bool $use_string_node rather than load in a node and split
     *   into keys for use of all keys, read in node as a string and uncompress
     *   but do not perform further decoding (faster if only doing reads).
     * @param bool $decode whether or not to unpack record
     * @param bool $look_up_blobs whether to leave in the record the
     *      offset and length for blob items or to look up the blob item
     * @param int $offset starting record item associated with this key
     *      to return
     * @param int $limit maximum number of record items associated with this
     *      key to return
     * @return array of items associated with this key in the BPlusTree
     *      if $look_up_blobs == false, then returns a pair
     *      ["ARCHIVE_FILE" => [filename where blob columns stored,
     *     array of items associated with this key]]
     */
    public function get($key, $is_encoded_key = true, $use_string_node = false,
        $decode = true, $look_up_blobs = true, $offset = 0, $limit = -1)
    {
        $table_tools = $this->table_tools;
        $key_node_filename = $this->find($key, $is_encoded_key);
        if (!$key_node_filename) {
            return null;
        }
        $archive_filename = (empty($this->blob_columns)) ? "" :
            $this->archiveFilenameFromNodeFilename($key_node_filename);
        if ($use_string_node) {
            $key_node = $table_tools->load($key_node_filename,
                $table_tools::AS_STRING_MODE, true);
        } else {
            $key_node = $table_tools->load($key_node_filename);
        }
        $rows = $this->getFromNode($key, $key_node, $archive_filename,
            $is_encoded_key, $use_string_node, $decode, $look_up_blobs, $offset,
            $limit);
        if (!$look_up_blobs) {
            return ["ARCHIVE_FILE" => $archive_filename, "ROWS" => $rows];
        }
        return $rows;
    }
    /**
     * Returns records associated with $key in $key_node and the file
     * associated with $archive_filename (for blob and serial column values)
     * if it exists.
     *
     * @param string $key the key to look up
     * @param array $key_node the BPlusTree index node in which to look up the
     *  key
     * @param string $archive_filename file name of archive data (blob and
     *  serial columns) associated with $key_node
     * @param bool $is_encoded_key whether the key is rawurlencoded or not
     * @param bool $decode  whether to $decode the records or leave them packed
     * @param bool $look_up_blobs whether to look up blob and serial columns
     * @param int $offset index of the first record associated with $key to
     *  return
     * @param int $limit maximum number of records associated with key to return
     * @return array of value records associated with $key
     */
    public function getFromNode($key, $key_node, $archive_filename,
        $is_encoded_key = true, $is_string_node = false, $decode = true,
        $look_up_blobs = true, $offset = 0, $limit = -1)
    {
        $table_tools = $this->table_tools;
        $encode_key = ($is_encoded_key) ? $key : rawurlencode($key);
        if ($is_string_node) {
            $values = $table_tools->findRowFromKeyTableString($key_node,
                 $encode_key);
        } else {
            $values = $table_tools->find($key_node, $encode_key);
        }
        if (!$values) {
            return null;
        }
        if (!$decode) {
            return $values;
        }
        if (!($values = $table_tools->unpack($values, $offset, $limit))) {
            crawlLog("Unpack BPlusTree error!!! (Key,offset,limit) was: ".
                "($key, $offset, $limit) ..");
            $value_message = (is_string($values)) ? toHexString($values) :
                serialize($values);
            crawlLog(".. value was:" . $value_message);
            return null;
        }
        $num_unpacked = count($values);
        if (!$look_up_blobs) {
            return $values;
        }
        $num_blob_columns = count($this->blob_columns);
        for ($k = 0; $k < $num_unpacked; $k++) {
            if ($num_blob_columns > 0) {
                $blob_offset = intval($values[$k][$this->blob_columns[0]]);
                for ($i = 0; $i < $num_blob_columns; $i++) {
                    $column_name = $this->blob_columns[$i];
                    $len = ($i + 1 < $num_blob_columns) ?
                        intval($values[$k][$this->blob_columns[$i + 1]]) :
                        $values[$k]["LAST_BLOB_LEN"];
                    $values[$k][$column_name] = ($len == 0) ? "" :
                        $this->getArchive($archive_filename, $blob_offset,
                        $len);
                    $blob_offset += $len;
                }
                unset($values[$k]["LAST_BLOB_LEN"]);
                foreach ($this->serial_columns as $field_name) {
                    $values[$k][$field_name] = unserialize(
                        $values[$k][$field_name]);
                }
            }
            $values[$k][$this->key_field] = ($is_encoded_key) ?
                $key : rawurldecode($encode_key);
        }
        return $values;
    }
    /**
     * Returns the file path to the file that may or may not contain $key
     * in the BPlusTree
     *
     * @param string $key to look up in BPlusTree
     * @param bool $is_encoded_key whether $key has been rawurlencode'd or not
     * @return string file path to node in BPlusTree that may or may not contain
     *      this key
     */
    public function find($key, $is_encoded_key = false)
    {
        $encode_key = ($is_encoded_key) ? $key : rawurlencode($key);
        if (!empty($this->last_find_folder) && !empty($this->last_find_key) &&
            $this->last_find_key <= $encode_key
            && !empty($this->last_find_next_key) &&
            $encode_key < $this->last_find_next_key) {
                return $this->last_find_folder;
        }
        $current_folder = $this->folder;
        $cache = & $this->tree_path_cache;
        $node_prefix = self::NODE_PREFIX;
        $least_node_name = self::LEAST_NODE_NAME;
        $node_prefix_and_key = self::NODE_PREFIX . $encode_key;
        while (isset($cache[$current_folder]) || is_dir($current_folder)) {
            $current_prefix = "$current_folder/$node_prefix";
            $len_current_prefix = strlen($current_prefix);
            if (!isset($cache[$current_folder])) {
                $cache[$current_folder] = glob("$current_prefix*");
            }
            $nodes = $cache[$current_folder];
            if (empty($nodes)) {
                return "$current_folder/$least_node_name";
            }
            if ($nodes == $current_folder) {
                break;
            }
            $exact_node = "$current_folder/$node_prefix_and_key";
            $least_node = "$current_folder/$least_node_name";
            $first_index = 0;
            $last_index = count($nodes) - 1;
            $this->last_find_folder = null;
            $this->last_find_key = null;
            $this->last_find_next_key = null;
            if ($exact_node < $nodes[$first_index]) {
                $this->last_find_folder = $least_node;
                $this->last_find_key = $encode_key;
                $this->last_find_next_key = substr($nodes[$first_index],
                    $len_current_prefix);
                $current_folder = $least_node;
            } else if ($exact_node == $nodes[$first_index]) {
                if (!empty($nodes[$first_index + 1])) {
                    $this->last_find_folder = $nodes[$first_index];
                    $this->last_find_key = $encode_key;
                    $this->last_find_next_key = substr($nodes[$first_index + 1],
                        $len_current_prefix);
                }
                $current_folder = $nodes[$first_index];
            } else if ($nodes[$last_index] <= $exact_node) {
                $current_folder = $nodes[$last_index];
            } else {
                while ($first_index < $last_index) {
                    $mid_index = ceil(($first_index + $last_index)/2);
                    if ($nodes[$mid_index] > $exact_node) {
                        $last_index = $mid_index - 1;
                    } else {
                        $first_index = $mid_index;
                    }
                }
                if (!empty($nodes[$first_index + 1])) {
                    $this->last_find_folder = $nodes[$first_index];
                    $this->last_find_key = $encode_key;
                    $this->last_find_next_key = substr($nodes[$first_index + 1],
                        $len_current_prefix);
                }
                $current_folder = $nodes[$first_index];
            }
        }
        $return_folder =  null;
        if ($nodes == $current_folder) {
            $return_folder =  $current_folder;
        } else if (file_exists($current_folder)) {
            if (count($cache) >= self::MAX_CACHE_SIZE) {
                $this->tree_path_cache = [];
            }
            $cache[$current_folder] = $current_folder;
            $return_folder =  $current_folder;
        } else {
            $this->last_find_folder = null;
            $this->last_find_key = null;
            $this->last_find_next_key = null;
        }
        return $return_folder;
    }
    /**
     * Given the path to an index node file in the BPlusTree compute the
     * path to the corresponding archive node file.
     *
     * @param string $node_filename of index node path
     * @return string associated archive node path
     */
    public function archiveFilenameFromNodeFilename($node_filename)
    {
        $parent_folder = $this->getParentFolder($node_filename);
        $len = strlen($parent_folder);
        $node_name = substr($node_filename, $len + 1);
        if ($node_name == self::LEAST_NODE_NAME) {
             return $parent_folder. "/" . self::ARCHIVE_PREFIX .
                self::LEAST_NODE_NAME;
        }
        return $parent_folder . "/" . self::ARCHIVE_PREFIX .
            substr($node_name, strlen(self::NODE_PREFIX));
    }
    /**
     * Return the blob item from $archive_filename at $offset of length $len,
     * uncompress the result
     *
     * @param string $archive_filename path to an archive node file for this
     *  BPlusTree
     * @param int $offset byte offset into archive node file file
     * @param int $len length of blob item
     * @return string uncompressed blob item from $archive_filename
     */
    public function getArchive($archive_filename, $offset, $len)
    {
        list($fh, $previous_archive_filename, $previous_instance_time) =
            $this->get_archive_cache;
        $blob_compressor = $this->blob_compressor;
        if (!$fh || $previous_archive_filename != $archive_filename ||
            $previous_instance_time != $this->instance_time) {
            if ($fh) {
                fclose($fh);
            }
            $fh = fopen($archive_filename , "r");
            $previous_archive_filename = $archive_filename;
            $previous_instance_time = $this->instance_time;
        }
        if (empty($blob_compressor)) {
            $blob_compress_type = $this->parameters["BLOB_COMPRESSOR"];
            $blob_compressor = new $blob_compress_type();
        }
        $value = false;
        if ($fh && fseek($fh, $offset) == 0 && $len > 0) {
            $compressed_file = fread($fh, $len);
            $value = $blob_compressor->uncompress($compressed_file);
        }
        $this->get_archive_cache = [$fh, $previous_archive_filename,
            $previous_instance_time];
        return $value;
    }
    /**
     * Given a encoded hash_key to be used as the name of in the name of
     * an archive file for a BPlusTree node and a folder where that
     * node should be stored, return the path name for an archive file for
     * a node
     * @param string $current_folder to store archive file in
     * @param string $encode_key encode key value to be used in the file
     *  name of a BPlusTree node archive file
     * @return string archive file path
     */
    protected function getArchiveName($current_folder, $encode_key)
    {
        $archive_filename = $current_folder . "/" . self::ARCHIVE_PREFIX .
            $encode_key;
        return $archive_filename;
    }
    /**
     * Write the blob and serial columns from a row to insert into BPlusTree
     * and then pack the rest of the row as a string
     *
     * @param string $archive_filename name of archive file for a BPlusTree node
     * @param array $row BPlusTree row to pack Blob columns for
     * @return string packed row
     */
    protected function writeBlobsAndPack($archive_filename, $row)
    {
        $blob_columns = $this->blob_columns ?? [];
        $serial_columns = $this->serial_columns ?? [];
        $format = $this->parameters["FORMAT"];
        unset($format['PRIMARY KEY']);
        $format_keys = array_keys($format);
        $out_value = [];
        foreach ($format_keys as $format_key) {
            $out_value[$format_key] = $row[$format_key] ?? null;
        }
        if (!empty($blob_columns)) {
            $old_offset = 0;
            foreach ($blob_columns as $blob_column) {
                $blob = $out_value[$blob_column] ?? "";
                if (empty($blob)) {
                    $offset = $old_offset;
                    $len = 0;
                } else {
                    if (in_array($blob_column, $serial_columns)) {
                        $blob = serialize($blob);
                    }
                    if (($add_info = $this->addArchive($archive_filename,
                        $blob)) === false) {
                        return false;
                    }
                    list($offset, $len) = $add_info;
                }
                $out_value[$blob_column] = $offset - $old_offset;
                $old_offset = $offset;
            }
            $out_value["LAST_BLOB_LEN"] = $len;
        }
        $out_value = $this->table_tools->pack($out_value);
        return $out_value;
    }
    /**
     * Add a value to an archive file $archive_filename of a B+-tree node
     * @param string $archive_filename name of archive file
     * @param string $value value to add
     * @return array [offset into archive, length data saved]
     */
    protected function addArchive($archive_filename, $value)
    {
        list($fh, $previous_archive_filename,
            $previous_instance_time) =  $this->add_archive_cache;
        if (!$fh || $previous_archive_filename != $archive_filename ||
            $previous_instance_time != $this->instance_time) {
            if ($fh) {
                fclose($fh);
            }
            $fh = fopen($archive_filename , "c+");
            $previous_archive_filename = $archive_filename;
            $previous_instance_time = $this->instance_time;
        }
        $blob_compress_type = $this->parameters["BLOB_COMPRESSOR"];
        $blob_compressor = new $blob_compress_type();
        fseek($fh, 0, SEEK_END);
        $offset = ftell($fh);
        $compressed_value = $blob_compressor->compress($value);
        $len = strlen($compressed_value);
        $success = (fwrite($fh, $compressed_value, $len) !== false) ? true :
            false;
        $this->add_archive_cache = [$fh, $previous_archive_filename,
            $previous_instance_time];
        if (!$success) {
            return false;
        }
        return [$offset, $len];
    }
    /**
     * Returns the file path of the parent node of the current internal node
     *
     * @param string $folder path to internal BPlusTree node folder.
     *  (Internal nodes are folders, leaves are files)
     * @return string file_path to parent or false, if no parent.
     */
    public function getParentFolder($folder)
    {
        if (!($last_pos = strrpos($folder, "/"))) {
            return false;
        }
        return substr($folder, 0, $last_pos);
    }
    /**
     * Save the operating parameters of this BPlusTree
     */
    public function saveParameters()
    {
        $parameter_path = $this->folder . "/" . self::PARAMETERS_FILE;
        file_put_contents($parameter_path, serialize($this->parameters),
            LOCK_EX);
    }
    /**
     * Returns the parameters (such as its signature, max keys per nodes, etc)
     * used to configure the BPlusTree stored at $folder
     *
     * @param string $folder file path to a stored BPlusTree
     * @return array configuration info about the BPlusTree
     */
    public static function getParameterInfo($folder)
    {
        $parameter_path = $folder . "/" . self::PARAMETERS_FILE;
        if(file_exists($parameter_path)) {
            $parameters = unserialize(file_get_contents($parameter_path)) ?? [];
            if (!empty($parameters["COMPRESSOR"])) {
                //original format didn't distinguish between compressor use
                $parameters["NODE_COMPRESSOR"] ??= $parameters["COMPRESSOR"];
                $parameters["BLOB_COMPRESSOR"] ??= $parameters["COMPRESSOR"];
            }
            return $parameters;
        } else {
            return [];
        }
    }
}
ViewGit