Last commit for tests/LSMTreeTest.php: c8033814411ec8fa001b23b27d8e6da89a8e867c

Fix LSMTreeTest unit tests, add more documentation for LSMTree, fix cache in IndexDocumentBundle::getPostingsString

Chris Pollett [2024-01-17 23:Jan:th]
Fix LSMTreeTest unit tests, add more documentation for LSMTree, fix cache in IndexDocumentBundle::getPostingsString
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the LSMTree class used to keep track of
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\LSMTree;
use seekquarry\yioop\library\PackedTableTools;
use seekquarry\yioop\library\UnitTest;

/**
 * Yioop LSMTree Unit Class
 *
 * Used to test insert, lookup, and deletion of key-value pairs in a
 * Log-structured Merge Tree.
 * @author Chris Pollett
 */
class LSMTreeTest extends UnitTest
{
    /**
     * Test directory to hold LSMTree used for these unit tests
     */
    const TEST_DIR = '/test_files/lsm_tree_test';
    /**
     * Folder names to use for test LSMTree
     * @param string
     */
    public $table_dirs;
    /**
     * Minimum degree is set to 2 and the number of key-value pairs is set to 25
     */
    public function setUp()
    {
        $this->table_dirs = [];
    }
    /**
     * Delete the LSMTree files created during the test
     */
    public function tearDown()
    {
        $model = new Model();
        foreach ($this->table_dirs as $table_dir) {
            $model->db->unlinkRecursive($table_dir, true);
        }
        $this->table_dirs = [];
    }
    /**
     * Used to create a single hash table in the folder
     * TEST_DIR . $max_items_per_file which allows at most
     * $max_items_per_file to be stored in a bucket
     *
     * @param int $max_items_per_file number of items allowed to be stored in a
     *  bucket
     */
    public function createTree($format, $max_file_size, $block_factor,
        $suffix = "")
    {
        $key_size = $format["PRIMARY KEY"][1];
        $table_dir = __DIR__ . self::TEST_DIR .
            "_{$key_size}_{$max_file_size}_{$block_factor}_$suffix";
        $this->table_dirs[] = $table_dir;
        return new LSMTree($table_dir, $format, $max_file_size,
            $block_factor);
    }
    /**
     * Test putting items in lsm-tree and
     * then seeing if the items can be retrieved
     */
    public function simplePutGetTierTestCase()
    {
        for ($max_file_size = 20; $max_file_size < 60; $max_file_size += 30) {
            for ($block_factor = 1; $block_factor < 3; $block_factor++) {
                for ($key_len = 16; $key_len > 5; $key_len -= 8) {
                    for ($cols = 0; $cols < 3; $cols++) {
                        $format = ["PRIMARY KEY" => ["KEY", $key_len],
                            "VALUE" => "TEXT"];
                        for ($j = 0; $j < $cols; $j++) {
                            $format["COL$j"] = "INT";
                        }
                        $lsm_tree = $this->createTree($format, $max_file_size,
                            $block_factor, "{$key_len}_$cols");
                        for ($i = 0; $i < 50; $i++) {
                            $key = str_pad("key$i", $key_len, "0",
                                STR_PAD_LEFT);
                            $entry = ["KEY" => $key, "VALUE" => "value$i"];
                            for ($j = 0; $j < $cols; $j++) {
                                $entry["COL$j"] = $i;
                            }
                            $lsm_tree->put($entry);
                        }
                        $lsm_tree->flush();
                        for  ($i = 0; $i < 50; $i++) {
                            $key = str_pad("key$i", $key_len, "0",
                                STR_PAD_LEFT);
                            $values = $lsm_tree->get($key);
                            $this->assertEqual("value$i", $values[0]["VALUE"],
                                "{$i}th insert key length $key_len, pad ".
                                "columns $j into single LSTM Tier ".
                                "{$values[0]["VALUE"]} should be retrieved ".
                                "as value$i");
                        }
                    }
                }
            }
        }
    }
    /**
     * Checks that the correct number of block folders are created when
     * inserting several items into am LSMTree with a small block factor
     * and max file size
     */
    public function blockFactorTestCase()
    {
        $max_file_size = 20;
        $block_factor = 2;
        $key_len = 16;
        $cols = 2;
        $format = ["PRIMARY KEY" => ["KEY", $key_len],
            "VALUE" => "TEXT"];
        $lsm_tree = $this->createTree($format, $max_file_size,
            $block_factor, "{$key_len}");
        for ($i = 0; $i < 15; $i++) {
            $key = str_pad("key$i", $key_len, "0",
                STR_PAD_LEFT);
            $entry = ["KEY" => $key, "VALUE" => "value$i"];
            $lsm_tree->put($entry);
        }
        $this->assertTrue(file_exists($lsm_tree->folder .
            "/Tier0000/A/F00000000000key12") && !file_exists($lsm_tree->folder .
                "/Tier0000/A/F00000000000key14"),
            "Correct number of block folders created");
    }
    /**
     * Checks that the correct number of data files are created when
     * inserting several items into am LSMTree with a small max file size
     */
    public function maxFileSizeTestCase()
    {
        $max_file_size = 50;
        $block_factor = 10000;
        $key_len = 16;
        $cols = 2;
        $format = ["PRIMARY KEY" => ["KEY", $key_len],
            "VALUE" => "TEXT"];
        $lsm_tree = $this->createTree($format, $max_file_size,
            $block_factor, "{$key_len}");
        for ($i = 0; $i < 15; $i++) {
            $key = str_pad("key$i", $key_len, "0",
                STR_PAD_LEFT);
            $entry = ["KEY" => $key, "VALUE" => "value$i"];
            $lsm_tree->put($entry);
        }
        $block_folder = $lsm_tree->folder . "/Tier0000/A/F000000000000key0";
        $this->assertTrue(file_exists("$block_folder/D00000000000key13") &&
            !file_exists("$block_folder/D00000000000key14"),
            "Correct number of data files created");
    }
    /**
     * Tests that values for identical keys are correctly merged when merge
     * two slots within a tier.
     */
    public function mergeRecordsTierTestCase()
    {
        $max_file_size = 50;
        $block_factor = 2;
        $key_len = 16;
        $cols = 2;
        $format = ["PRIMARY KEY" => ["KEY", $key_len],
            "VALUE" => "TEXT"];
        for ($j = 0; $j < $cols; $j++) {
            $format["COL$j"] = "INT";
        }
        $lsm_tree = $this->createTree($format, $max_file_size,
            $block_factor, "{$key_len}_$cols");
        foreach ([["A", 0, 10], ["B", 5, 15]] as $slot_info) {
            $lsm_tree->selectPutSlot($slot_info[0]);
            for ($i = $slot_info[1]; $i < $slot_info[2]; $i++) {
                $key = str_pad("key$i", $key_len, "0",
                    STR_PAD_LEFT);
                $entry = ["KEY" => $key, "VALUE" => "value{$slot_info[0]}$i"];
                for ($j = 0; $j < $cols; $j++) {
                    $entry["COL$j"] = $i;
                }
                $lsm_tree->put($entry);
            }
            $lsm_tree->flush();
        }
        $lsm_tree->mergeTiers();
        $key = str_pad("key10", $key_len, "0", STR_PAD_LEFT);
        $values = $lsm_tree->get($key);
        for ($i = 0; $i < 15; $i++) {
            $key = str_pad("key$i", $key_len, "0", STR_PAD_LEFT);
            $values = $lsm_tree->get($key);
            if ($i < 10) {
                $this->assertEqual("valueA$i", $values[0]["VALUE"],
                    "Able to retrieve Slot A value $i");
            }
            if ($i > 4 && $i < 10) {
                $this->assertEqual("valueB$i", $values[1]["VALUE"],
                    "Able to retrieve Slot B value $i");
            }
            if ($i >= 10) {
                $this->assertEqual("valueB$i", $values[0]["VALUE"],
                    "Able to retrieve Slot B value $i");
            }
        }
    }
    /**
     * Tests that tiers are correctly created and merged when many items are
     * inserted into a tree with a small block factor and max file size.
     * Test correctness by seeing if each value can be retrieved with its key.
     */
    public function mergeTiersTestCase()
    {
        $max_file_size = 50;
        $block_factor = 2;
        $key_len = 16;
        $cols = 3;
        $format = ["PRIMARY KEY" => ["KEY", $key_len],
            "VALUE" => "TEXT"];
        for ($j = 0; $j < $cols; $j++) {
            $format["COL$j"] = "INT";
        }
        $lsm_tree = $this->createTree($format, $max_file_size,
            $block_factor, "{$key_len}_$cols");
        $slot = "B";
        for ($i = 0; $i < 250; $i++) {
            if ($i % 5 == 0) {
                $slot = ($slot == "B") ? "A" : "B";
                $lsm_tree->flush();
                $lsm_tree->selectPutSlot($slot);
            }
            if ($i > 0 && $i % 10 == 0) {
                $lsm_tree->mergeTiers();
            }
            $k = 10 * floor($i / 10);
            $k += ($slot == "A") ? 2 * ($i % 10) : 2 * (($i % 10) - 5) + 1;
            $key = str_pad("key$k", $key_len, "0", STR_PAD_LEFT);
            $entry = ["KEY" => $key, "VALUE" => "value$k"];
            for ($j = 0; $j < $cols; $j++) {
                $entry["COL$j"] = $k;
            }
            $lsm_tree->put($entry);
        }
        $lsm_tree->flush();
        $lsm_tree->mergeTiers();
        $this->assertEqual($lsm_tree->getMaxTier(), 5,
            "Final maxTiers is correct");
        $tiers = [false, true, false, false, true, true];
        for ($i = 0; $i <= 5; $i++) {
            $this->assertEqual($lsm_tree->occupiedTier($i), $tiers[$i],
                "{$i} th tier correctly occupied");
        }
        for ($i = 0; $i < 250; $i++) {
            $key = str_pad("key$i", $key_len, "0", STR_PAD_LEFT);
            $values = $lsm_tree->get($key);
            $this->assertEqual("value$i", $values[0]["VALUE"],
                "Able to retrieve {$i}th value");
        }
    }
}
ViewGit