First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures

Chris Pollett [2024-01-15 02:Jan:th]
First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures.
The folder structure of IndexDocumentBundles was also modified and now supports an overflow folder
(which could be on a different hard drive). ArcTool has been updated to support migration to the
new indexes.
Filename
src/executables/ArcTool.php
src/executables/DictionaryUpdater.php
src/library/CrawlQueueBundle.php
src/library/IndexDocumentBundle.php
src/library/IndexManager.php
src/library/LSMTree.php
src/library/PackedTableTools.php
src/library/PartitionDocumentBundle.php
src/library/index_bundle_iterators/WordIterator.php
tests/BPlusTreeTest.php
tests/IndexDocumentBundleTest.php
tests/IndexManagerTest.php
tests/LSMTreeTest.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index ab522e802..cded12a3a 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -177,7 +177,10 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
                 $this->makeFilter($argv[2], $argv[3], $argv[4]);
                 break;
             case "migrate":
-                $this->migrateIndexArchive($path);
+                if (!isset($argv[3])) {
+                    $argv[3] = 1;
+                }
+                $this->migrateIndexArchive($path, $argv[3]);
                 break;
             case "partition":
                 $this->outputPartitionInfo($path, $argv[3]);
@@ -195,7 +198,16 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
                 if (!isset($argv[3])) {
                     $argv[3] = 0;
                 }
-                $this->rebuildIndexBundle($path, $argv[3]);
+                if (!isset($argv[4])) {
+                    $argv[4] = 1;
+                }
+                $this->rebuildIndexBundle($path, $argv[3], true, $argv[4]);
+                break;
+            case "remerge":
+                if (!isset($argv[3])) {
+                    $argv[3] = 0;
+                }
+                $this->rebuildIndexBundle($path, $argv[3], false);
                 break;
             case "show":
                 if (!isset($argv[3])) {
@@ -1119,10 +1131,26 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
      * after the process to "Old" . name_of_old_bundle
      *
      * @param string $archive_path file path to a IndexArchiveBundle
+     * @param int $number_of_processes number of processes that should be
+     *  used to perform the migration (more is faster, but this is only used
+     *  for migration of indexes between 3.0 and 4.0)
      */
-    public function migrateIndexArchive($archive_path)
+    public function migrateIndexArchive($archive_path, $number_of_processes = 1)
     {
         $archive_type = $this->getArchiveKind($archive_path);
+        if ($archive_type == "IndexDocumentBundle") {
+            $version = IndexManager::getVersion($archive_path);
+            if ($version < "4.0") {
+                echo "Checking if bundle upgradable...\n";
+                echo "--------------------------------\n";
+                echo "...IndexDocumentBundle Version $version detected!\n";
+                echo "...Upgrading bundle to version ".
+                    IndexDocumentBundle::DEFAULT_VERSION . "...\n";
+                $this->migrateIndexDocument($archive_path,
+                    $number_of_processes);
+                return;
+            }
+        }
         $archive_type_use = ($archive_type == "DoubleIndexBundleOld") ?
             "DoubleIndexBundle" : $archive_type;
         $archive_name = C\NS_LIB . $archive_type_use;
@@ -1272,6 +1300,95 @@ to use it again.

 EOD;
     }
+    /**
+     * Migrates a Yioop >=3 index to the most recent index format of Yioop by
+     * moving old partition files into block/partition folders, deleting the
+     * old dictionary, rebuilding the index, and updating the bundle VERSION.
+     *
+     * @param string $archive_path path to a Yioop IndexDocumentBundle to
+     *   migrate
+     * @param int $number_of_processes number of computer processes to use
+     *   to perform the migration (more should be faster)
+     */
+    public function migrateIndexDocument($archive_path, $number_of_processes=1)
+    {
+        $new_documents_folder = $archive_path . "/" .
+            IndexDocumentBundle::DOCUMENTS_FOLDER;
+        $old_documents_folder = "$archive_path/documents";
+        if (!file_exists($old_documents_folder)) {
+            echo "...Could not find any documents in bundle. Stopping!\n";
+            return;
+        }
+        if (!rename($old_documents_folder, $old_documents_folder . "-old")) {
+            echo "...Could not rename old documents folder. Stopping!\n";
+            return;
+        }
+        $old_documents_folder .= "-old";
+        if (!file_exists($new_documents_folder)) {
+            echo "...Creating new Documents folder\n";
+            mkdir($new_documents_folder);
+            chmod($new_documents_folder, 0777);
+        }
+        $old_param_file = "$old_documents_folder/pdb_parameters.txt";
+        if (file_exists($old_param_file)) {
+            $pdb_parameters = unserialize(file_get_contents($old_param_file));
+            $pdb_parameters["OVERFLOW_THRESHOLD"] = C\OVERFLOW_THRESHOLD;
+            $pdb_parameters["PARTITION_SIZE_THRESHOLD"] =
+                PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD;
+            file_put_contents("$new_documents_folder/pdb_parameters.txt",
+                serialize($pdb_parameters));
+        }
+        $old_partitions =
+            glob("$old_documents_folder/partition_*.txt.gz") ?: [];
+        // glob sorts as strings; natural sort keeps partition 2 before 10
+        sort($old_partitions, SORT_NATURAL);
+        $i = 0;
+        foreach ($old_partitions as $old_partition) {
+            $block = sprintf("%'.05d",
+                floor($i / PartitionDocumentBundle::BLOCK_FACTOR));
+            $folder_path = $new_documents_folder;
+            // past the threshold, use overflow folder (TODO confirm layout)
+            if (C\OVERFLOW_THRESHOLD > 0 && $i > C\OVERFLOW_THRESHOLD) {
+                $folder_path .=
+                    "/" . PartitionDocumentBundle::OVERFLOW_DIR_FOLDER;
+            }
+            $block_path = "$folder_path/".
+                PartitionDocumentBundle::BLOCK_PREFIX . $block;
+            $partition_path = "$block_path/".
+                PartitionDocumentBundle::PARTITION_PREFIX .
+                sprintf("%'.010d", $i);
+            foreach ([$folder_path, $block_path, $partition_path] as $folder) {
+                if (!file_exists($folder)) {
+                    mkdir($folder);
+                    chmod($folder, 0777);
+                }
+            }
+            $file_path = substr($old_partition, 0, -strlen(".txt.gz"));
+            rename($old_partition, "$partition_path/" .
+                PartitionDocumentBundle::ARCHIVE_FILENAME . ".txt.gz");
+            if (file_exists("$file_path.ix")) {
+                rename("$file_path.ix", "$partition_path/" .
+                    PartitionDocumentBundle::INDEX_FILENAME);
+            }
+            $i++;
+        }
+        $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
+        $db = new $dbms_manager();
+        $db->unlinkRecursive($archive_path . "/" .
+            IndexDocumentBundle::DICTIONARY_FOLDER, true);
+        $db->unlinkRecursive($archive_path . "/positions_doc_maps", true);
+        $db->unlinkRecursive($old_documents_folder);
+        echo "...Building Dictionary in new format:\n";
+        $this->rebuildIndexBundle($archive_path, 0, true, $number_of_processes);
+        echo "...Dictionary Rebuilt!\n";
+        echo "...Updating bundles version number!\n";
+        $archive_info = unserialize(file_get_contents($archive_path . "/" .
+            IndexDocumentBundle::ARCHIVE_INFO_FILE));
+        $archive_info["VERSION"] = IndexDocumentBundle::DEFAULT_VERSION;
+        file_put_contents($archive_path . "/" .
+            IndexDocumentBundle::ARCHIVE_INFO_FILE, serialize($archive_info));
+        echo "Bundle Upgrade Complete!\n";
+    }
     /**
      * Used to recompute both the index shards and the dictionary
      * of an index archive. The first step involves re-extracting the
@@ -1282,10 +1399,21 @@ EOD;
      * @param mixed $start_generation which web archive generation to start
      *  rebuild from. If 'continue' then keeps going from where last attempt at
      *  a rebuild was.
+     * @param bool $force_recompute whether to force the recomputation of
+     *   partition inverted indexes or to try to use existing ones if present
+     * @param int $number_of_processes number of CPU processes to use
+     *   when trying to recompute partition inverted indexes
      */
-    public function rebuildIndexBundle($archive_path, $start_generation = 0)
+    public function rebuildIndexBundle($archive_path, $start_generation = 0,
+        $force_recompute = true, $number_of_processes = 1)
     {
-        L\crawlLog("Rebuilding index!!");
+        $rebuilding = ($force_recompute) ? "Rebuilding" : "Remerging";
+        $rebuild = ($force_recompute) ? "rebuild" : "remerge";
+        $sleep_time = ($force_recompute) ? 15 : 3;
+        if (!$force_recompute) {
+            $number_of_processes = 1;
+        }
+        L\crawlLog("$rebuilding index using $number_of_processes processes!!");
         $bundle_num = -1;
         $bundle_path = $archive_path;
         if (preg_match("/\-\d$/", $archive_path)) {
@@ -1308,7 +1436,7 @@ EOD;
             if (file_exists($next_partition_path)) {
                 $start_generation =
                     intval(file_get_contents($next_partition_path));
-                echo "Restarting rebuild index from $start_generation\n";
+                echo "Restarting $rebuild index from $start_generation\n";
             } else {
                 $start_generation = 0;
             }
@@ -1320,8 +1448,6 @@ EOD;
         if ($start_generation == 0) {
             $db->unlinkRecursive($bundle_path . "/" .
                 IndexDocumentBundle::DICTIONARY_FOLDER, false);
-            $db->unlinkRecursive($bundle_path . "/" .
-                IndexDocumentBundle::POSITIONS_DOC_MAP_FOLDER, false);
         }
         file_put_contents($next_partition_path, $start_generation);
         if ($archive_type == "FeedDocumentBundle") {
@@ -1346,42 +1472,93 @@ EOD;
         $save_partition = $index_archive->documents->parameters[
             "SAVE_PARTITION"] ?? 0;
         L\crawlLog("Save partition is $save_partition");
-        $options = "run 0 '$bundle_path'";
+        $updater_option =  ($force_recompute) ? "rebuild_%d" : "remerge";
+        $options = "run %d '$bundle_path' $updater_option";
         $old_next_partition = -1;
         $next_partition = $start_generation;
         $continue = false;
-        $dictionary_log = C\LOG_DIR . "/0-DictionaryUpdater.log";
-        $fp = fopen($dictionary_log, "w");
-        fclose($fp);
+        $recent_log_times = [];
+        $dictionary_logs = [];
+        for ($i = 0; $i < $number_of_processes; $i++) {
+            $dictionary_logs[$i] = C\LOG_DIR . "/$i-DictionaryUpdater.log";
+            $fp = fopen($dictionary_logs[$i], "w");
+            fclose($fp);
+            $recent_log_times[$i] = time();
+        }
+        $rebuild_dones = [];
         while ($next_partition < $save_partition) {
             if ($old_next_partition != $next_partition) {
                 $old_next_partition = $next_partition;
-                L\crawlLog("Begin Processing Partition: $next_partition");
-                L\crawlLog("Exec'ing DictionaryUpdater with parameters: " .
-                    $options);
-                L\CrawlDaemon::execScriptInOwnProcess(C\BASE_DIR .
-                    "/executables/DictionaryUpdater.php", $options);
+                $num_forks = min($save_partition - $next_partition,
+                    $number_of_processes);
+                echo "Num forks:$num_forks, num processes $number_of_processes\n";
+                for ($i = 0; $i < $num_forks; $i++) {
+                    $process_partition = $next_partition + $i;
+                    $process_options = sprintf($options, $i,
+                        $process_partition);
+                    L\crawlLog(
+                        "Begin Processing Partition: $process_partition");
+                    L\crawlLog("Exec'ing DictionaryUpdater with parameters: " .
+                        $process_options);
+                    L\CrawlDaemon::execScriptInOwnProcess(C\BASE_DIR .
+                        "/executables/DictionaryUpdater.php", $process_options);
+                }
             } else {
-                if (file_exists($dictionary_log)) {
-                    $recent_log_data = file_get_contents($dictionary_log);
-                    echo $recent_log_data;
-                    $fp = fopen($dictionary_log, "w");
-                    fclose($fp);
+                for ($i = 0; $i < $num_forks; $i++) {
+                    if (file_exists($dictionary_logs[$i])) {
+                        $recent_log_data =
+                            file_get_contents($dictionary_logs[$i]);
+                        if (!empty($recent_log_data)) {
+                            $recent_log_times[$i] = time();
+                            echo "Process $i:\n----------\n";
+                            echo $recent_log_data;
+                        } else {
+                            echo "Waiting on data from process $i\n";
+                        }
+                        $fp = fopen($dictionary_logs[$i], "w");
+                        fclose($fp);
+                    }
                 }
                 clearstatcache();
-                if (time() > filemtime($next_partition_path) +
-                    5 * C\LOG_TIMEOUT) {
-                    L\crawlLog("DictionaryUpdater seems to have crashed, ".
-                        "exiting ArcTool");
-                    exit();
+                for ($i = 0; $i < $num_forks; $i++) {
+                    if (empty($rebuild_dones[$i]) &&
+                        time() > $recent_log_times[$i] + 5 * C\LOG_TIMEOUT) {
+                        L\crawlLog("DictionaryUpdater seems to have crashed, ".
+                            "exiting ArcTool");
+                        exit();
+                    }
+                }
+                sleep(5);
+            }
+            $finished = true;
+            for ($i = 0; $i < $num_forks; $i++) {
+                $rebuild_status_file = "$bundle_path/rebuild_$i";
+                if (file_exists($rebuild_status_file)) {
+                    $rebuild_dones[$i] = true;
+                } else {
+                    $finished = false;
                 }
-                sleep(15);
             }
-            $next_partition =
-                intval(file_get_contents($next_partition_path));
+            if ($finished) {
+                $next_partition += $num_forks;
+                file_put_contents($next_partition_path, $next_partition);
+                $rebuild_dones = [];
+                for ($i = 0; $i < $num_forks; $i++) {
+                    $rebuild_status_file = "$bundle_path/rebuild_$i";
+                    unlink($rebuild_status_file);
+                    $recent_log_times[$i] = time();
+                }
+            }
+        }
+        if ($force_recompute) {
+            /* when force recompute, we first build inverted indexes for
+              all partition, then the call below is used to remerge them into
+              the global dictionary
+              */
+            $this->rebuildIndexBundle($archive_path, $start_generation, false);
         }
         $index_archive->forceSave();
-        echo "\nIndex Rebuild Complete!\n";
+        echo "\nIndex $rebuild complete!\n";
     }
     /**
      * Used to create an archive_bundle_iterator for a non-yioop archive
@@ -1480,12 +1657,14 @@ php ArcTool.php dict double_index_name which_bundle word start_record num_record
 php ArcTool.php fix-partition bundle_name
 php ArcTool.php fix-partition bundle_name start_partition
 php ArcTool.php fix-partition bundle_name start_partition end_partition
-    /*  recomputes the hash index (.ix) files for a range of partitions
-        from start_partition to end_partition in the documents subfolder of
-        an IndexDocumentBundle. An ix file contains a sequence of compressed
-        4-tuple (doc_id, summary_offset, summary_length, cache_length)
-        corresponding to a partition file (these end in .txt.gz and are
-        a sequence of compressed document summaries followed by orginal
+    /*  recomputes the hash index (index) file in each partition folder
+        (BSOME_NUMBER/PSOME_NUMBER folder)
+        for a range of partitions from start_partition to end_partition in the
+        Documents subfolder of an IndexDocumentBundle. An index file contains
+        a sequence of compressed 4-tuple (doc_id, summary_offset,
+        summary_length, cache_length)
+        corresponding to a archive.txt.gz file (these  are
+        a sequence of compressed document summaries followed by original
         documents).
      */

@@ -1511,10 +1690,15 @@ php ArcTool.php make-filter dict_file filter_file column_num
      */

 php ArcTool.php migrate bundle_name
+php ArcTool.php migrate bundle_name number_of_processes
     /* migrates old Yioop index formats such as FeedArchiveBundle,
        IndexArchiveBundle, and old style DoubleIndexBundle to their
        modern respective equivalents FeedDocumentBundle, IndexDocumentBundle,
-       and modern DoubleIndexBundle
+       and modern DoubleIndexBundle. It can also migrate a version < 4
+       FeedDocumentBundle, IndexDocumentBundle to a current version bundle
+       The default number of processes that are used for index rebuilding is 1,
+       however, rebuilding is faster if you specify a number_of_processes
+       proportional to the number of CPU cores of your machine.
     */

 php ArcTool.php partition bundle_name partition_number
@@ -1527,16 +1711,34 @@ php ArcTool.php partition double_index_name which_bundle partition_number
 php ArcTool.php rebuild bundle_name
 php ArcTool.php rebuild double_index_name which_bundle
 php ArcTool.php rebuild bundle_name continue
+php ArcTool.php rebuild bundle_name continue number_of_processes
 php ArcTool.php rebuild double_index_name which_bundle continue
 php ArcTool.php rebuild bundle_name partition_num
+php ArcTool.php rebuild bundle_name partition_num number_of_processes
 php ArcTool.php rebuild double_index_name which_bundle partition_num
     /*  re-extracts words from summaries files in bundle_name a partition at a
         time, builds an inverted index for that partition and adds to the global
         dictionary. If this process crashes the keyword continue can be used
         to continue from where it left off. If a partition number is supplied
-        process continue from that partition number.
+        process continue from that partition number. The default number of
+        processes that are used for index rebuilding is 1, however,
+        rebuilding is faster if you specify a number_of_processes proportional
+        to the number of CPU cores of your machine.
         */

+php ArcTool.php remerge bundle_name
+php ArcTool.php remerge double_index_name which_bundle
+php ArcTool.php remerge bundle_name continue
+php ArcTool.php remerge double_index_name which_bundle continue
+php ArcTool.php remerge bundle_name partition_num
+php ArcTool.php remerge double_index_name which_bundle partition_num
+    /*  this operates like the previously described rebuild command except
+        if the inverted index files for a partition already exist in that
+        partition they are not recomputed (if they don't exist, they
+        are recomputed), only the merging of them into the
+        global dictionary is computed.
+     */
+
 php ArcTool.php show bundle_name start num
 php ArcTool.php show double_index_name which_bundle start num
     /* outputs items start through num from bundle_name or name of Yioop or
diff --git a/src/executables/DictionaryUpdater.php b/src/executables/DictionaryUpdater.php
index d24724eef..30b7acb55 100644
--- a/src/executables/DictionaryUpdater.php
+++ b/src/executables/DictionaryUpdater.php
@@ -70,8 +70,16 @@ class DictionaryUpdater implements CrawlConstants
      *  naming lock files
      * @param string $bundle_path the path to the IndexDocumentBundle or
      *  FeedDucumentBundle we are adding dictionary info for
+     * @param string $rebuild_or_remerge either the string "rebuild",
+     *  "rebuild_some_number", or "remerge". If it is "rebuild", it will both
+     *  recompute partition inverted indexes then build a global dictionary
+     *  from these. If it is "remerge", then if a partition inverted index
+     *  exists it is directly merged into a new global dictionary without
+     *  recomputing it. "rebuild_some_number" just rebuilds partition
+     *  some_number's inverted index, but does not merge it into the
+     *  global dictionary.
      */
-    public static function run($channel, $bundle_path)
+    public static function run($channel, $bundle_path, $rebuild_or_remerge)
     {
         if (!isset($_SERVER["LOG_TO_FILES"])) {
             $_SERVER["LOG_TO_FILES"] = true;
@@ -108,6 +116,7 @@ class DictionaryUpdater implements CrawlConstants
         }
         $next_partition_path = $bundle_path . "/".
             IndexDocumentBundle::NEXT_PARTITION_FILE;
+        $rebuild_path = "$bundle_path/rebuild_$channel";
         if (file_exists($next_partition_path)) {
             $index_archive->next_partition_to_add = intval(
                 file_get_contents($next_partition_path));
@@ -117,7 +126,9 @@ class DictionaryUpdater implements CrawlConstants
                 $index_archive->next_partition_to_add);
         }
         //note the false parameter means will only update for one partition
-        $index_archive->updateDictionary($next_partition_path, false);
+        $index_archive->updateDictionary($next_partition_path, false,
+            $rebuild_or_remerge);
+        file_put_contents($rebuild_path, "done");
     }
     /**
      * Given a folder name, determines the kind of bundle (if any) it holds.
@@ -165,11 +176,12 @@ class DictionaryUpdater implements CrawlConstants
 }

 if (!empty($argv[3]) && in_array($argv[1], ["run", "terminal"])) {
+    $argv[4] ??= "rebuild";
     /*
      * Instantiate and runs the QueueSever
      */
     if ($argv[1] == "terminal") {
         $_SERVER["LOG_TO_FILES"] = "terminal";
     }
-    DictionaryUpdater::run($argv[2], $argv[3]);
+    DictionaryUpdater::run($argv[2], $argv[3], $argv[4]);
 }
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index a32f7bbe2..da8db9638 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -630,7 +630,8 @@ class CrawlQueueBundle
             $pre_max_folder = $num_sub_dirs;
         }
         $max_folder = min($num_sub_dirs, $pre_max_folder);
-        $last_folder = ($last_folder < $max_folder) ?
+        $last_folder = ($last_folder < $max_folder &&
+            !empty($sub_dirs[$last_folder + 1])) ?
             $last_folder + 1 : 0;
         crawlLog("Tier chosen $last_folder, Max Tier Choice $max_folder, ".
             " Highest Tier $num_sub_dirs, Exp Counter $exp_max_folder");
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 6098f55ce..1235b4d14 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -54,7 +54,7 @@ class IndexDocumentBundle implements CrawlConstants
      * The version of this IndexDocumentBundle. The lowest format number is
      * 3.0 as prior inverted index/document stores used IndexArchiveBundle's
      */
-    const DEFAULT_VERSION = "3.3";
+    const DEFAULT_VERSION = "4.0";
     /**
      * Default values for the configuration parameters of an
      * IndexDocumentBundle
@@ -264,13 +264,10 @@ class IndexDocumentBundle implements CrawlConstants
      *  IndexDocumentBundle
      * @param int $num_docs_per_partition the number of documents to be stored
      *  in a single partition
-     * @param int $max_keys the maximum number of keys used by the BPlusTree
-     *  used for the inverted index
      */
     public function __construct($dir_name, $read_only_archive = true,
         $description = null, $num_docs_per_partition =
-        C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS,
-        $overflow_threshold = C\OVERFLOW_THRESHOLD)
+        C\NUM_DOCS_PER_PARTITION, $overflow_threshold = C\OVERFLOW_THRESHOLD)
     {
         $this->dir_name = $dir_name;
         $is_dir = is_dir($this->dir_name);
@@ -290,24 +287,12 @@ class IndexDocumentBundle implements CrawlConstants
                 $archive_info_path));
             $just_got_info = true;
         }
-        if ($this->archive_info['VERSION'] < "3.1") {
-            $this->archive_info['RECORD_COMPRESSOR'] =
-                C\NS_COMPRESSORS . "GzipCompressor";
-        }
         $this->archive_info['RECORD_COMPRESSOR'] ??=
             C\NS_COMPRESSORS . "NonCompressor";
         $record_compressor = $this->archive_info['RECORD_COMPRESSOR'];
         $this->archive_info['BLOB_COMPRESSOR'] ??=
             C\NS_COMPRESSORS . "GzipCompressor";
         $blob_compressor = $this->archive_info['BLOB_COMPRESSOR'];
-        if ($just_got_info &&
-            empty($this->archive_info['BPLUS_BLOB_COMPRESSOR'])) {
-            $this->archive_info['BPLUS_BLOB_COMPRESSOR'] =
-                C\NS_COMPRESSORS . "GzipCompressor";
-        }
-        $this->archive_info['BPLUS_BLOB_COMPRESSOR'] ??=
-            C\NS_COMPRESSORS . "NonCompressor";
-        $bplus_blob_compressor = $this->archive_info['BPLUS_BLOB_COMPRESSOR'];
         if (!$read_only_archive && !$just_got_info) {
             file_put_contents($archive_info_path,
                 serialize($this->archive_info));
@@ -365,20 +350,11 @@ class IndexDocumentBundle implements CrawlConstants
         if (!$read_only_archive) {
             $this->documents->initCountIfNotExists("VISITED_URLS_COUNT");
         }
-        if ($this->archive_info['VERSION'] < "3.2") {
-            $this->dictionary = new BPlusTree($this->dir_name . "/" .
-                self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16],
-                "PARTITION" => "INT", "NUM_DOCS" => "INT",
-                "NUM_OCCURRENCES" => "INT", "POSTINGS" => "BLOB"], $max_keys,
-                $record_compressor, $bplus_blob_compressor);
-        } else {
-            $this->dictionary = new BPlusTree($this->dir_name . "/" .
-                self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16],
-                "PARTITION" => "INT", "NUM_DOCS" => "INT",
-                "NUM_OCCURRENCES" => "INT", "POSTINGS_OFFSET" => "INT",
-                "POSTINGS_LEN" => "INT"], $max_keys,
-                $record_compressor, $bplus_blob_compressor);
-        }
+        $this->dictionary = new LSMTree($this->dir_name . "/" .
+            self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16],
+            "PARTITION" => "INT", "NUM_DOCS" => "INT",
+            "NUM_OCCURRENCES" => "INT", "POSTINGS_OFFSET" => "INT",
+            "POSTINGS_LEN" => "INT"]);
     }
     /**
      * Add the array of $pages to the documents PartitionDocumentBundle
@@ -399,7 +375,7 @@ class IndexDocumentBundle implements CrawlConstants
     }
     /**
      * For every partition between next partition and save partition, adds
-     * the posting list information to the dictionary BPlusTree. At the
+     * the posting list information to the dictionary LSMTree. At the
      * end of this process next partition and save partition should be the same
      *
      * @param string $taking_too_long_touch a filename of a file to touch
@@ -408,17 +384,33 @@ class IndexDocumentBundle implements CrawlConstants
      *  file to prevent  Yioop's web interface from stopping the crawl because
      *  it has seen no recent  progress activity on a crawl.
      * @param bool $till_equal is set to true will keep adding each partition
-     *  up till the save partition if set to false, oln;y adds one partition
+     *  up till the save partition if set to false, only adds one partition
+     * @param string $rebuild_or_remerge either the string "rebuild",
+     *  "rebuild_some_number", or "remerge". If it is "rebuild", it will both
+     *  recompute partition inverted indexes then build a global dictionary
+     *  from these. If it is "remerge", then if a partition inverted index
+     *  exists it is directly merged into a new global dictionary without
+     *  recomputing it. "rebuild_some_number" just rebuilds partition
+     *  some_number's inverted index, but does not merge it into the
+     *  global dictionary.
      */
     public function updateDictionary($taking_too_long_touch = null,
-        $till_equal = true)
+        $till_equal = true, $rebuild_or_remerge = "rebuild")
     {
-        $next_partition = $this->next_partition_to_add;
+        if (preg_match("/rebuild\_(\d+)/", $rebuild_or_remerge, $matches)) {
+            $next_partition = intval($matches[1]);
+            $rebuild_or_remerge = "partition_only";
+            crawlLog("Rebuilding just partition $next_partition!");
+        } else {
+            $next_partition = $this->next_partition_to_add;
+        }
         $save_partition = $this->documents->parameters["SAVE_PARTITION"];
         $current_num_docs = $this->documents->parameters['ACTIVE_COUNT'];
         $max_items_per_partition =
             $this->documents->parameters['MAX_ITEMS_PER_FILE'];
-        crawlLog("Current save partition has $current_num_docs documents.");
+        if ($rebuild_or_remerge == "rebuild") {
+            crawlLog("Current save partition has $current_num_docs documents.");
+        }
         crawlLog("Max documents per partition: $max_items_per_partition.");
         $memory_limit = metricToInt(ini_get("memory_limit"));
         $before_usage = memory_get_usage();
@@ -427,15 +419,36 @@ class IndexDocumentBundle implements CrawlConstants
         $advanced_partition = false;
         while ($next_partition < $save_partition && $advanced_partition <=
             $till_equal) {
-            crawlLog("Indexer adding Partition to dictionary...");
-            crawlLog("...because save partition changed");
+            if ($rebuild_or_remerge == "rebuild") {
+                crawlLog("Indexer adding Partition to dictionary...");
+                crawlLog("...because save partition changed");
+            }
             $switch_time = microtime(true);
             // Save current shard dictionary to main dictionary
-            $this->buildInvertedIndexPartition($next_partition,
-                $taking_too_long_touch);
+            $have_inverted_index_files = false;
+            if ($rebuild_or_remerge == "remerge") {
+                $base_folder = $this->getPartitionBaseFolder($next_partition);
+                $have_inverted_index_files = true;
+                foreach (self::PARTITION_FILENAMES as $filename) {
+                    $component_filename = $base_folder . "/" . $filename;
+                    if (!file_exists($component_filename)) {
+                        $have_inverted_index_files = false;
+                        break;
+                    }
+                }
+            }
+            if ($have_inverted_index_files) {
+                crawlLog("...Partition $next_partition has all of its inverted".
+                    " index files, not recomputing, just merging.");
+            } else {
+                $this->buildInvertedIndexPartition($next_partition,
+                    $taking_too_long_touch);
+            }
             $num_freed = garbageCollect();
-            $this->addPartitionPostingsDictionary(
-                $next_partition, $taking_too_long_touch);
+            if ($rebuild_or_remerge != "partition_only") {
+                $this->addPartitionPostingsDictionary(
+                    $next_partition, $taking_too_long_touch);
+            }
             crawlLog("Indexer force running garbage collector after partition".
                  " advance. This freed $num_freed bytes.");
             $after_usage = memory_get_usage();
@@ -504,6 +517,8 @@ class IndexDocumentBundle implements CrawlConstants
         $this->last_entries = $last_entries_tools->load($last_entries_filename);
         $num_postings = substr_count($postings_string, "\xFF") + 1;
         $last_marker = 0;
+        $slot = ($dictionary->occupiedTier(0)) ? "B" : "A";
+        $dictionary->selectPutSlot($slot);
         for ($i = 0; $i < $num_postings; $i++) {
             $cur_marker = strpos($postings_string, "\xFF", $last_marker);
             $diff = ($cur_marker === false) ? null :
@@ -537,7 +552,8 @@ class IndexDocumentBundle implements CrawlConstants
                 "POSTINGS_OFFSET" => $postings_offset,
                 "POSTINGS_LEN" => $postings_len]);
         }
-        $dictionary->flushLastPutNode();
+        $dictionary->flush();
+        $dictionary->mergeTiers();
         crawlLog("...Finished Adding Partition Posting Info to " .
             "Dictionary: " . changeInMicrotime($start_time));
     }
@@ -607,13 +623,15 @@ class IndexDocumentBundle implements CrawlConstants
         /* set up $doc_map_filename, $postings_filename, $postings_filename,
            $positions_filename, etc
          */
-        foreach (self::PARTITION_FILENAMES as $filename) {
-            $component_filename = $base_folder . "/" . $filename;
-            if (file_exists($component_filename)) {
-                unlink($component_filename);
+        if (!$just_stats) {
+            foreach (self::PARTITION_FILENAMES as $filename) {
+                $component_filename = $base_folder . "/" . $filename;
+                if (file_exists($component_filename)) {
+                    unlink($component_filename);
+                }
+                $component = $filename . "_filename";
+                $$component = $component_filename;
             }
-            $component = $filename . "_filename";
-            $$component = $component_filename;
         }
         $doc_map_tools = $this->doc_map_tools;
         $postings_tools = $this->postings_tools;
@@ -1430,9 +1448,9 @@ class IndexDocumentBundle implements CrawlConstants
         if (!$dictionary) {
             return [];
         }
-        $result = $dictionary->get($term_id, true, true, true, false, $offset,
-            $num_partitions);
-        if (empty($result)) {
+        $result = ["ROWS" =>
+            $dictionary->get($term_id, $offset, $num_partitions)];
+        if (empty($result["ROWS"])) {
             $result = [];
         }
         $max_found_partition = 0;
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 3da26a4ab..03091fca5 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -71,6 +71,10 @@ class IndexManager implements CrawlConstants
      * Max number of URLs to be cached for most recent version of a page lookup
      */
     const URLS_CACHE_SIZE = 1000;
+    /**
+     * Max number of Word Info items that can be cached
+     */
+    const INFO_CACHE_SIZE = 1000;
     /**
      * Returns a reference to the managed copy of an IndexArchiveBundle object
      * with a given timestamp or feed (for handling media feeds)
@@ -271,13 +275,27 @@ class IndexManager implements CrawlConstants
         $start_generation = -1, $num_distinct_generations = -1,
         $with_remaining_total = false)
     {
+        static $info_cache = [];
+        $lookup_hash = crawlHash($index_name . $term_id . $threshold .
+            $start_generation . $num_distinct_generations .
+            $with_remaining_total);
+        if (isset($info_cache[$lookup_hash])) {
+            $tmp = $info_cache[$lookup_hash];
+            unset($info_cache[$lookup_hash]);
+            $info_cache[$lookup_hash] = $tmp;
+            return $tmp;
+        }
         $index = self::getIndex($index_name);
         $pre_info = [];
         if (!empty($index) && method_exists($index, "getWordInfo")) {
             $start_generation = ($start_generation < 0) ? 0 : $start_generation;
-            return $index->getWordInfo($term_id, $threshold,
-                $start_generation, $num_distinct_generations,
+            $info_cache[$lookup_hash] = $index->getWordInfo($term_id,
+                $threshold, $start_generation, $num_distinct_generations,
                 $with_remaining_total);
+            if (count($info_cache) >= self::INFO_CACHE_SIZE) {
+                array_shift($info_cache);
+            }
+            return $info_cache[$lookup_hash];
         } else if (!empty($index->dictionary)) {
             $pre_info =
                 $index->dictionary->getWordInfo($term_id, true, $threshold,
@@ -324,7 +342,12 @@ class IndexManager implements CrawlConstants
             $total = 0;
             $info = [];
         }
-        return ($with_remaining_total) ? [$total, $info] : $info;
+        $info_cache[$lookup_hash] = ($with_remaining_total) ?
+            [$total, $info] : $info;
+        if (count($info_cache) > self::INFO_CACHE_SIZE) {
+            array_shift($info_cache);
+        }
+        return $info_cache[$lookup_hash];
     }
     /**
      * Finds posting info related to the most recent version
diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php
new file mode 100644
index 000000000..639ad42f5
--- /dev/null
+++ b/src/library/LSMTree.php
@@ -0,0 +1,734 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2024
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+/**
+ * This class implements a Log Structured merge tree data structure suitable for
+ * storing and retrieving sortable records
+ *
+ * @author Chris Pollett
+ */
+class LSMTree
+{
+    /**
+     * Default value for $block_factor: the number of data files a Tier
+     * lets accumulate in one of its subfolders before Tier::flush starts
+     * a new subfolder
+     */
+    const BLOCK_FACTOR = 10000;
+    /**
+     * Threshold that Tier::readRecords compares against when deciding
+     * whether to evict the oldest entry of its in-memory cache of files
+     */
+    const RECORD_CACHE_SIZE = 100;
+    /**
+     * Prefix used for the names of data files within a Tier subfolder. The
+     * rest of the name is the rawurlencode'd first key stored in the file
+     */
+    const DATA_FILE_PREFIX = "D";
+    /**
+     * Prefix used for the names of the subfolders of a Tier slot. The rest
+     * of the name is the rawurlencode'd first key stored in the subfolder
+     */
+    const FOLDER_PREFIX = "F";
+    /**
+     * Name of the file, kept both in a Tier slot folder and in each of its
+     * subfolders, that lists one per line the subfolders (respectively,
+     * data files) contained there
+     */
+    const INDEX_FILE = "index.txt";
+    /**
+     * Default value for $max_file_size: the number of bytes of encoded
+     * records a Tier buffers before flushing them to a new data file
+     */
+    const MAX_FILE_SIZE = 32768;
+    /**
+     * Name of the file in the LSMTree's folder in which getMaxTier()
+     * stores the number of the highest tier
+     */
+    const MAX_TIER_FILENAME = "max_tier.txt";
+    /**
+     * Prefix used for the names of the folders holding each tier
+     */
+    const TIER_PREFIX = "Tier";
+    /**
+     * Letter ("A" or "B") of the slot of tier 0 that put() writes to
+     * @var string
+     */
+    public $base_slot = "A";
+    /**
+     * Maximum number of data files per Tier subfolder handed to the Tier's
+     * this LSMTree creates for writing
+     * @var int
+     */
+    public $block_factor;
+    /**
+     * Folder for storing the LSMTree files
+     * @var string
+     */
+    public $folder;
+    /**
+     * Maximum size in bytes of a data file handed to the Tier's this
+     * LSMTree creates for writing
+     * @var int
+     */
+    public $max_file_size;
+    /**
+     * Tier object for the slot of tier 0 currently being written to. It is
+     * null until the first put() after construction or selectPutSlot()
+     * @var Tier
+     */
+    public $put_slot = null;
+    /**
+     * Used to pack, unpack, and merge rows according to the $format this
+     * LSMTree was constructed with
+     * @var PackedTableTools
+     */
+    public $table_tools;
+    /**
+     * The highest tiered index in the LSMTree. As this field is static its
+     * value is shared by every LSMTree instance in the current process
+     * @var int
+     */
+    public static $max_tier;
+    /**
+     * Creates/Loads the LSMTree stored in the given folder. If the folder
+     * does not exist yet it is created.
+     *
+     * @param string $folder is the folder for storing the LSMTree files
+     * @param array $format the column names, keys and types for this
+     *  LSMTree object. Its 'PRIMARY KEY' entry must specify a positive,
+     *  fixed key length
+     * @param int $max_file_size maximum size in bytes of a data file
+     * @param int $block_factor maximum number of data files kept in one
+     *  subfolder of a tier slot
+     */
+    public function __construct($folder, $format, $max_file_size =
+        self::MAX_FILE_SIZE, $block_factor = self::BLOCK_FACTOR)
+    {
+        $key_length = $format['PRIMARY KEY'][1] ?? null;
+        if (empty($key_length) || $key_length < 0) {
+            throw new \Exception(
+                "LSMTree class requires fixed lengthed keys");
+        }
+        $this->folder = $folder;
+        $folder_missing = !file_exists($folder);
+        if ($folder_missing) {
+            mkdir($folder);
+            chmod($folder, 0777);
+        }
+        // called for its side effect of computing max tier if not set yet
+        $this->getMaxTier();
+        $this->table_tools = new PackedTableTools($format);
+        $this->max_file_size = $max_file_size;
+        $this->block_factor = $block_factor;
+    }
+    /**
+     * Computes the path of the folder used to hold a given tier of this
+     * LSMTree
+     *
+     * @param int $tier number of the tier to get the folder of
+     * @return string path consisting of this tree's folder followed by the
+     *  tier prefix and the tier number formatted to a width of four
+     */
+    public function getTierFolder($tier)
+    {
+        $tier_name = self::TIER_PREFIX . sprintf("%'.04d", $tier);
+        return $this->folder . "/" . $tier_name;
+    }
+    /**
+     * Checks whether the "A" slot of a tier exists on disk
+     *
+     * @param int $tier number of the tier to check
+     * @return bool true if the folder for slot A of the tier exists
+     */
+    public function occupiedTier($tier)
+    {
+        $slot_a_folder = $this->getTierFolder($tier) . "/A";
+        return file_exists($slot_a_folder);
+    }
+    /**
+     * Returns the number of the highest tier of this LSMTree. The value is
+     * taken, in order of preference, from an in-memory cache, from the
+     * max tier file in the tree's folder, or by listing the tier folders
+     * (in which case the max tier file is rewritten).
+     *
+     * The in-memory cache is kept per folder so that several LSMTree
+     * objects for different folders living in the same process do not
+     * return each other's value. self::$max_tier is still set to the
+     * value returned for code that reads that field.
+     *
+     * @param bool $recompute if true ignore both the in-memory cache and
+     *  the max tier file and recompute from the tier folders present
+     * @return int number of the highest tier, 0 if there are no tiers yet
+     */
+    public function getMaxTier($recompute = false)
+    {
+        static $folder_max_tiers = [];
+        if (!isset(self::$max_tier)) {
+            /* someone cleared the static field to force values to be
+               re-read, so honor that for the per folder cache too
+             */
+            $folder_max_tiers = [];
+        }
+        $folder = $this->folder;
+        if (!$recompute && isset($folder_max_tiers[$folder])) {
+            return self::$max_tier = $folder_max_tiers[$folder];
+        }
+        $max_path = $folder . "/" . self::MAX_TIER_FILENAME;
+        if (!$recompute && file_exists($max_path)) {
+            $max_tier = intval(file_get_contents($max_path));
+        } else {
+            $tier_path = $folder . "/" . self::TIER_PREFIX;
+            $tiers = glob($tier_path . "*");
+            $max_tier = 0;
+            // glob can return false on error, so don't sort blindly
+            if (!empty($tiers)) {
+                sort($tiers);
+                $last_tier = $tiers[count($tiers) - 1];
+                $max_tier = intval(substr($last_tier, strlen($tier_path)));
+            }
+            file_put_contents($max_path, $max_tier);
+        }
+        $folder_max_tiers[$folder] = $max_tier;
+        return self::$max_tier = $max_tier;
+    }
+    /**
+     * Chooses which slot of tier 0 subsequent put() calls write to. Any
+     * Tier object currently held for writing is dropped (without being
+     * flushed by this method) so the next put() opens the chosen slot.
+     *
+     * @param string $letter "A" selects slot A, anything else selects
+     *  slot B
+     */
+    public function selectPutSlot($letter)
+    {
+        $this->put_slot = null;
+        if ($letter == "A") {
+            $this->base_slot = "A";
+        } else {
+            $this->base_slot = "B";
+        }
+    }
+    /**
+     * Adds an unpacked row to the currently selected slot of tier 0,
+     * opening that slot for writing if this has not been done yet
+     *
+     * @param array $row associative array of field values, including the
+     *  key field, to be packed and stored
+     */
+    public function put($row)
+    {
+        $slot = $this->put_slot;
+        if (empty($slot)) {
+            $slot = new Tier(
+                $this->getTierFolder(0) . "/" . $this->base_slot,
+                $this->table_tools, "w", $this->max_file_size,
+                $this->block_factor);
+            $this->put_slot = $slot;
+        }
+        $slot->put($row, false);
+    }
+    /**
+     * Writes to disk any records still buffered in memory by the slot of
+     * tier 0 being written to. Does nothing if no slot is open.
+     */
+    public function flush()
+    {
+        if (empty($this->put_slot)) {
+            return;
+        }
+        $this->put_slot->flush();
+    }
+    /**
+     * Walks the tiers from 0 up to the maximum tier known when the method
+     * starts and merges each tier that has a B slot into the next higher
+     * tier. Afterwards the maximum tier is recomputed from disk.
+     */
+    public function mergeTiers()
+    {
+        crawlLog("..Begin LSMTiers Merging Tiers..");
+        $max_tier = $this->getMaxTier();
+        crawlLog("....begin Max Tier is: $max_tier");
+        $tier = 0;
+        while ($tier <= $max_tier) {
+            $has_b_slot = file_exists($this->getTierFolder($tier) . "/B");
+            if ($has_b_slot) {
+                crawlLog("....Merging Tier $tier");
+                $this->mergeTier($tier);
+            } else {
+                crawlLog("....Tier $tier doesn't need to be merged.");
+            }
+            $tier++;
+        }
+        $max_tier = $this->getMaxTier(true);
+        crawlLog("....end Max Tier is: $max_tier");
+        crawlLog("..End LSMTiers Merging Tiers..");
+    }
+    /**
+     * Removes the contents of a tier's folder using the unlinkRecursive
+     * method of the configured datasource manager
+     *
+     * @param int $tier number of the tier to empty
+     */
+    public function emptyTier($tier)
+    {
+        $folder_to_empty = $this->getTierFolder($tier);
+        $manager_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
+        $manager = new $manager_class();
+        $manager->unlinkRecursive($folder_to_empty, true);
+    }
+    /**
+     * Compares two entries by their keys, that is, by their first key_len
+     * many bytes where key_len comes from this tree's table tools
+     *
+     * @param string $entry_a first entry (key followed by packed values)
+     * @param string $entry_b second entry
+     * @return int negative, zero, or positive as the key of $entry_a is
+     *  less than, equal to, or greater than that of $entry_b
+     */
+    public function compare($entry_a, $entry_b)
+    {
+        return strncmp($entry_a, $entry_b, $this->table_tools->key_len);
+    }
+    /**
+     * Combines two entries into a single entry. The key is taken from
+     * $entry_a and the values are those produced by the table tools'
+     * mergeRowValues on the values of the two entries.
+     *
+     * @param string $entry_a first entry (key followed by packed values)
+     * @param string $entry_b second entry, expected to have the same key
+     * @return string entry consisting of the key and merged values
+     */
+    public function mergeEntries($entry_a, $entry_b)
+    {
+        $tools = $this->table_tools;
+        $parts_a = entryToKeyValues($entry_a, $tools);
+        $parts_b = entryToKeyValues($entry_b, $tools);
+        $merged_values = $tools->mergeRowValues($parts_a[1], $parts_b[1]);
+        return $parts_a[0] . $merged_values;
+    }
+    /**
+     * Merges the A and B slots of a tier into a slot of the next higher
+     * tier and then empties the tier. If the tier has no B slot nothing
+     * is done; if it has a B slot but no A slot, B is just renamed to A.
+     * Otherwise, the entries of the two slots are read in order and
+     * written, smallest key first, to slot A of the next tier if that
+     * slot does not exist yet, and to its slot B if it does. Entries with
+     * equal keys are combined using mergeEntries().
+     *
+     * @param int $tier number of the tier to merge
+     */
+    public function mergeTier($tier)
+    {
+        $tier_folder = $this->getTierFolder($tier);
+        if (!file_exists($tier_folder . "/B")) {
+            return;
+        }
+        if (!file_exists($tier_folder . "/A")) {
+            rename($tier_folder . "/B", $tier_folder . "/A");
+            return;
+        }
+        $a_tier = new Tier($tier_folder . "/A", $this->table_tools);
+        $b_tier = new Tier($tier_folder . "/B", $this->table_tools);
+        $next_tier_folder = $this->getTierFolder($tier + 1);
+        $target_folder = ($this->occupiedTier($tier + 1)) ?
+            $next_tier_folder . "/B" : $next_tier_folder . "/A";
+        $target_tier = new Tier($target_folder, $this->table_tools, "w",
+            $this->max_file_size, $this->block_factor);
+        $a_row = $a_tier->next();
+        $b_row = $b_tier->next();
+        $cnt = 0;
+        // standard two way merge, an empty row means that slot is used up
+        while(!empty($a_row) || !empty($b_row)) {
+            if (empty($b_row) && !empty($a_row)) {
+                $target_tier->put($a_row);
+                $a_row = $a_tier->next();
+            } else if (empty($a_row) && !empty($b_row)) {
+                $target_tier->put($b_row);
+                $b_row = $b_tier->next();
+            } else if (!empty($a_row) && !empty($b_row)) {
+                $cmp = $this->compare($a_row, $b_row);
+                if ($cmp < 0) {
+                    $target_tier->put($a_row);
+                    $a_row = $a_tier->next();
+                } else if ($cmp > 0) {
+                    $target_tier->put($b_row);
+                    $b_row = $b_tier->next();
+                } else {
+                    // same key in both slots: output one combined entry
+                    $target_tier->put($this->mergeEntries($a_row, $b_row));
+                    $a_row = $a_tier->next();
+                    $b_row = $b_tier->next();
+                }
+            }
+            $cnt++;
+            crawlTimeoutLog("......have merged $cnt items of Tier $tier.");
+        }
+        // write out whatever is still buffered before deleting the inputs
+        $target_tier->flush();
+        $this->emptyTier($tier);
+    }
+    /**
+     * Returns the record items associated with a $key as stored in the
+     * LSMTree. Slot A of each tier is consulted from the highest tier down
+     * to tier 0 and the items found are concatenated in that order.
+     * If $key is not present in the tree then an empty array is returned.
+     *
+     * @param string $key to look up in current LSMTree
+     * @param int $offset starting record item associated with this key
+     *      to return
+     * @param int $limit maximum number of record items associated with this
+     *      key to return, null == till end
+     * @return array of items associated with this key in the LSMTree
+     */
+    public function get($key, $offset = 0, $limit = null)
+    {
+        $max_tier = $this->getMaxTier();
+        $rows = [];
+        $max_rows = $offset + ($limit ?? 0);
+        for ($i = $max_tier; $i >= 0; $i--)
+        {
+            $add_rows = $this->getTier($i, $key);
+            if (is_array($add_rows) && !empty($add_rows)) {
+                /* append rather than use +, as + on two integer indexed
+                   arrays silently discards items of the second array whose
+                   index is already present in the first
+                 */
+                $rows = array_merge($rows, $add_rows);
+            }
+            // stop early once enough items for the requested slice are had
+            if ($limit > 0 && count($rows) >= $max_rows) {
+                break;
+            }
+        }
+        if ($offset > 0 || $limit > 0) {
+            $rows = array_slice($rows, $offset, $limit);
+        }
+        return $rows;
+    }
+    /**
+     * Looks up a key in slot A of a single tier
+     *
+     * @param int $tier number of the tier to look in
+     * @param string $key key to look up
+     * @return mixed null if the tier has no slot A folder, otherwise
+     *  whatever Tier::get returns for the key
+     */
+    public function getTier($tier, $key)
+    {
+        $slot_a_folder = $this->getTierFolder($tier) . "/A";
+        if (file_exists($slot_a_folder)) {
+            $slot_a = new Tier($slot_a_folder, $this->table_tools);
+            return $slot_a->get($key);
+        }
+        return null;
+    }
+    /**
+     * Save the operating parameters of this LSMTree by serializing
+     * $this->parameters to the parameters file in the tree's folder,
+     * holding an exclusive lock during the write.
+     *
+     * NOTE(review): neither the constant PARAMETERS_FILE nor the field
+     * $parameters is declared in this class as it appears in this file --
+     * confirm they are meant to exist before calling this method.
+     */
+    public function saveParameters()
+    {
+        $parameter_path = $this->folder . "/" . self::PARAMETERS_FILE;
+        file_put_contents($parameter_path, serialize($this->parameters),
+            LOCK_EX);
+    }
+    /**
+     * Returns the parameters used to configure the LSMTree stored at
+     * $folder, as read and unserialized from its parameters file
+     *
+     * NOTE(review): the constant PARAMETERS_FILE is not declared in this
+     * class as it appears in this file -- confirm it is meant to exist.
+     * Also unserialize() signals failure by returning false, which the ??
+     * operator below does not replace with [].
+     *
+     * @param string $folder file path to a stored LSMTree
+     * @return array configuration info about the LSMTree, [] if there is
+     *  no parameters file in $folder
+     */
+    public static function getParameterInfo($folder)
+    {
+        $parameter_path = $folder . "/" . self::PARAMETERS_FILE;
+        if(file_exists($parameter_path)) {
+            $parameters = unserialize(file_get_contents($parameter_path)) ?? [];
+            return $parameters;
+        } else {
+            return [];
+        }
+    }
+}
+/**
+ * Splits an entry into its key and the rest of the entry
+ *
+ * @param string $entry string consisting of a fixed length key followed
+ *  by packed values
+ * @param object $table_tools object whose key_len field gives the length
+ *  in bytes of a key
+ * @param bool $decode_key not used by this function
+ * @return array pair [key, values] where key is the first key_len bytes
+ *  of $entry and values is whatever follows
+ */
+function entryToKeyValues($entry, $table_tools, $decode_key = false)
+{
+    $split_at = $table_tools->key_len;
+    return [substr($entry, 0, $split_at), substr($entry, $split_at)];
+}
+/**
+ * Reads and writes the records stored in one slot folder of one tier of an
+ * LSMTree
+ */
+class Tier
+{
+    /**
+     * Number of data files allowed in a subfolder of this Tier before
+     * flush() starts a new subfolder
+     * @var int
+     */
+    public $block_factor;
+    /**
+     * Key of the first entry in the currently buffered $records. flush()
+     * uses it to name the data file (and, if needed, subfolder) written
+     * @var string
+     */
+    public $first_active_key;
+    /**
+     * Folder in which this Tier's files are stored
+     * @var string
+     */
+    public $folder;
+    /**
+     * Index into $iterator_folders of the subfolder next() is reading
+     * @var int
+     */
+    public $iterator_folder_index;
+    /**
+     * Subfolder names read from this Tier's index file for iteration
+     * @var array
+     */
+    public $iterator_folders;
+    /**
+     * Index into $iterator_files of the data file next() is reading
+     * @var int
+     */
+    public $iterator_file_index;
+    /**
+     * Data file names read from the current subfolder's index file
+     * @var array
+     */
+    public $iterator_files;
+    /**
+     * Index into $iterator_entries of the entry last returned
+     * @var int
+     */
+    public $iterator_entry_index;
+    /**
+     * Still encoded entries of the data file currently being iterated
+     * @var array
+     */
+    public $iterator_entries;
+    /**
+     * Number of bytes of buffered records beyond which put() flushes
+     * before adding a further entry
+     * @var int
+     */
+    public $max_file_size;
+    /**
+     * "w" if this Tier can be written to; the constructor defaults to "r"
+     * @var string
+     */
+    public $mode;
+    /**
+     * Used to pack and unpack rows and to determine the key length
+     * @var PackedTableTools
+     */
+    public $table_tools;
+    /**
+     * Encoded entries, separated by "\xFF", that have been put() but not
+     * yet flushed. Only initialized when the mode is "w"
+     * @var string
+     */
+    private $records;
+    /**
+     * Not referenced by any method of this class in this file
+     * @var string
+     */
+    private $active_filename;
+    /**
+     * Contents of files read by readRecords(), split into records and
+     * keyed by the crawlHash of the file name. Shared by all Tier objects
+     * @var array
+     */
+    private static $cache = [];
+    /**
+     * Sets up a Tier object for the given slot folder. In mode "w" the
+     * folder is created if missing and an empty record buffer is set up.
+     * In any other mode, if the folder is missing, construction stops
+     * before the iterator state is initialized.
+     *
+     * @param string $folder folder holding this slot's files
+     * @param object $table_tools used to pack/unpack rows and get key info
+     * @param string $mode "w" to allow writing, otherwise read only
+     * @param int $max_file_size maximum size in bytes of a data file
+     * @param int $block_factor maximum number of data files per subfolder
+     */
+    public function __construct($folder, $table_tools, $mode = "r",
+        $max_file_size = LSMTree::MAX_FILE_SIZE,
+        $block_factor = LSMTree::BLOCK_FACTOR)
+    {
+        $this->folder = $folder;
+        $this->table_tools = $table_tools;
+        $this->mode = $mode;
+        $this->max_file_size = $max_file_size;
+        $this->block_factor = $block_factor;
+        $writable = ($mode == "w");
+        if (!file_exists($folder)) {
+            if (!$writable) {
+                return;
+            }
+            makePath($folder);
+        }
+        if ($writable) {
+            $this->records = "";
+        }
+        $this->reset();
+    }
+    /**
+     * Writes the currently buffered records to a new data file named
+     * after the first buffered key. The file goes into the last subfolder
+     * of this Tier unless there is no subfolder yet or the last one already
+     * holds block_factor many data files, in which case a new subfolder,
+     * also named after the first buffered key, is created. The index file
+     * of the subfolder (and of the Tier if a subfolder was added) is
+     * rewritten and the buffer is emptied.
+     *
+     * @return bool false if this Tier is not in write mode or no key has
+     *  been buffered yet, true otherwise
+     */
+    public function flush()
+    {
+        if ($this->mode != "w" || empty($this->first_active_key)) {
+            return false;
+        }
+        if (empty($this->records)) {
+            return true;
+        }
+        $old_cwd = getcwd();
+        chdir($this->folder);
+        $tier_folders = glob(LSMTree::FOLDER_PREFIX . "*");
+        $tiers_changed = false;
+        if (empty($tier_folders)) {
+            $active_folder = LSMTree::FOLDER_PREFIX .
+                rawurlencode($this->first_active_key);
+            mkdir($active_folder);
+            chmod($active_folder, 0777);
+            $tier_folders = [$active_folder];
+            $tiers_changed = true;
+            $data_files = [];
+        } else {
+            $last_folder = $tier_folders[count($tier_folders) - 1];
+            chdir($last_folder);
+            $data_files = glob(LSMTree::DATA_FILE_PREFIX . "*");
+            if (empty($data_files)) {
+                $data_files = [];
+            }
+            chdir($this->folder);
+            $num_data_files = count($data_files);
+            if ($num_data_files >= $this->block_factor) {
+                $active_folder = LSMTree::FOLDER_PREFIX .
+                    rawurlencode($this->first_active_key);
+                mkdir($active_folder);
+                // same permissions as when the first subfolder is made
+                chmod($active_folder, 0777);
+                $data_files = [];
+                $tier_folders[] = $active_folder;
+                $tiers_changed = true;
+            } else {
+                $active_folder = $last_folder;
+            }
+        }
+        chdir($old_cwd);
+        $active_path = $this->folder . "/" . $active_folder;
+        $data_file = LSMTree::DATA_FILE_PREFIX . rawurlencode(
+            $this->first_active_key);
+        $data_files[] = $data_file;
+        $data_path = "$active_path/$data_file";
+        file_put_contents($data_path, $this->records);
+        /* readRecords() caches data files by name, so forget any copy of
+           an earlier file of the same name (for example, from before this
+           tier was last emptied), otherwise later reads would be stale
+         */
+        unset(self::$cache[crawlHash($data_path)]);
+        $this->writeRecords("$active_path/" . LSMTree::INDEX_FILE, $data_files);
+        if ($tiers_changed) {
+            $this->writeRecords($this->folder . "/" . LSMTree::INDEX_FILE,
+                $tier_folders);
+        }
+        $this->records = "";
+        return true;
+    }
+    /**
+     * Looks up the values stored for a key in this Tier. The Tier's index
+     * file is searched for the subfolder whose name is the largest one not
+     * exceeding the key, that subfolder's index file is searched in the
+     * same way for a data file, and then the data file is scanned for a
+     * record beginning with the key.
+     *
+     * NOTE(review): the key is searched for as given, whereas records are
+     * stored after encode255() -- confirm keys can never contain bytes
+     * that encode255() alters.
+     *
+     * @param string $key key to look up
+     * @return mixed false if no record for the key is found, otherwise
+     *  the result of unpacking the values stored with the key
+     */
+    public function get($key)
+    {
+        $tier_folders = $this->readRecords(
+            $this->folder . "/" . LSMTree::INDEX_FILE);
+        $url_encoded_key = rawurlencode($key);
+        $key_folder = $this->binarySearch(LSMTree::FOLDER_PREFIX .
+            $url_encoded_key, $tier_folders);
+        if (!$key_folder) {
+            return false;
+        }
+        $key_path = $this->folder . "/$key_folder";
+        $data_files = $this->readRecords("$key_path/" . LSMTree::INDEX_FILE);
+        $data_file = $this->binarySearch(LSMTree::DATA_FILE_PREFIX .
+            $url_encoded_key, $data_files);
+        if (!$data_file) {
+            return false;
+        }
+        $table_tools = $this->table_tools;
+        $key_len = $table_tools->key_len;
+        $record_string = file_get_contents("$key_path/$data_file");
+        if ($record_string === false) {
+            return false;
+        }
+        $start = strpos($record_string, "\xFF$key");
+        if ($start !== false) {
+            $start++; // skip over the separator
+        } else {
+            // only the first record of a file has no separator before it
+            if (strncmp($record_string, $key, $key_len) != 0) {
+                return false;
+            }
+            $start = 0;
+        }
+        $end = strpos($record_string, "\xFF", $start);
+        if ($end === false) {
+            $end = strlen($record_string);
+        }
+        /* third argument of substr is a length not an end offset, so use
+           the difference to avoid also picking up the records that follow
+         */
+        $length = $end - $start;
+        $record = decode255(substr($record_string, $start, $length));
+        $values = substr($record, $key_len);
+        return $table_tools->unpack($values);
+    }
+    /**
+     * Appends an entry to the in-memory buffer of records of this Tier.
+     * If adding the entry would make the buffer exceed max_file_size the
+     * buffer is flushed to disk first.
+     *
+     * @param mixed $row either an already packed entry string (key
+     *  followed by packed values) or, if $is_packed is false, an
+     *  associative array to be packed using the table tools
+     * @param bool $is_packed whether $row is an already packed entry
+     * @return bool false if this Tier is not in write mode, else true
+     */
+    public function put($row, $is_packed = true)
+    {
+        if($this->mode != "w") {
+            return false;
+        }
+        $tools = $this->table_tools;
+        $entry = $row;
+        if (!$is_packed) {
+            $entry = $row[$tools->key_field] . $tools->pack($row);
+        }
+        $encoded_entry = encode255($entry);
+        // the + 1 accounts for the separator
+        $size_after_add = strlen($this->records) + strlen($encoded_entry) + 1;
+        if ($size_after_add > $this->max_file_size) {
+            $this->flush();
+        }
+        if (empty($this->records)) {
+            $key_values = entryToKeyValues($entry, $tools, true);
+            $this->first_active_key = $key_values[0];
+        }
+        if (strlen($this->records) > 0) {
+            $this->records .= "\xFF";
+        }
+        $this->records .= $encoded_entry;
+        return true;
+    }
+    /**
+     * Searches a sorted array of strings for the place a needle belongs
+     *
+     * @param string $needle string to search for
+     * @param array $haystack strings in increasing strcmp order
+     * @return mixed false if $haystack is empty or $needle is less than
+     *  its first element; otherwise the element equal to $needle if the
+     *  search lands on one, or else the element just before where
+     *  $needle would go
+     */
+    public function binarySearch($needle, $haystack)
+    {
+        $last = count($haystack) - 1;
+        if ($last < 0) {
+            return false;
+        }
+        if (strcmp($needle, $haystack[0]) < 0) {
+            return false;
+        }
+        if (strcmp($needle, $haystack[$last]) >= 0) {
+            return $haystack[$last];
+        }
+        $floor = 0;
+        $ceiling = $last;
+        while ($ceiling - $floor > 1) {
+            $middle = intdiv($floor + $ceiling, 2);
+            $order = strcmp($needle, $haystack[$middle]);
+            if ($order < 0) {
+                $ceiling = $middle;
+            } else if ($order > 0) {
+                $floor = $middle;
+            } else {
+                return $haystack[$middle];
+            }
+        }
+        return $haystack[$floor];
+    }
+    /**
+     * Positions the iterator of this Tier on the first entry of the first
+     * data file of the first subfolder and returns that entry
+     *
+     * @return mixed the decoded first entry, or false if the list of
+     *  subfolders, of data files, or of entries read is empty or not an
+     *  array
+     */
+    public function firstEntry()
+    {
+        $base = $this->folder;
+        $folders = $this->readRecords("$base/" . LSMTree::INDEX_FILE);
+        $this->iterator_folders = $folders;
+        if (empty($folders) || !is_array($folders)) {
+            return false;
+        }
+        $this->iterator_folder_index = 0;
+        $first_folder_path = "$base/" . $folders[0];
+        $files = $this->readRecords( "$first_folder_path/" .
+            LSMTree::INDEX_FILE);
+        $this->iterator_files = $files;
+        if (empty($files) || !is_array($files)) {
+            return false;
+        }
+        $this->iterator_file_index = 0;
+        $first_file = $files[0];
+        $entries = $this->readRecords("$first_folder_path/{$first_file}",
+            "\xFF");
+        $this->iterator_entries = $entries;
+        if (empty($entries) || !is_array($entries)) {
+            return false;
+        }
+        $this->iterator_entry_index = 0;
+        return decode255($entries[0]);
+    }
+    /**
+     * Advances the iterator of this Tier and returns the entry it then
+     * points at. On the first call (when no subfolder list has been loaded
+     * yet) this is the first entry of the Tier. Otherwise, the iterator
+     * moves to the next entry of the current data file, or failing that to
+     * the first entry of the next data file of the current subfolder, or
+     * failing that to the first entry of the first data file of the next
+     * subfolder.
+     *
+     * @return mixed the decoded entry, or false if there are no more
+     *  entries
+     */
+    public function next()
+    {
+        if (empty($this->iterator_folders)) {
+            return $this->firstEntry();
+        }
+        // case: more entries in current data file
+        $this->iterator_entry_index++;
+        $entry = $this->iterator_entries[$this->iterator_entry_index] ?? false;
+        if ($entry) {
+            return decode255($entry);
+        }
+        // case: more data files in current subfolder
+        $folder = $this->folder;
+        $this->iterator_entry_index = 0;
+        $this->iterator_file_index++;
+        $file = $this->iterator_files[$this->iterator_file_index] ?? false;
+        if ($file) {
+            $iterator_folder = $this->iterator_folders[
+                $this->iterator_folder_index];
+            $file_path = "$folder/$iterator_folder/$file";
+            $this->iterator_entries = $this->readRecords($file_path, "\xFF");
+            return decode255($this->iterator_entries[0]) ?? false;
+        }
+        // case: need to move on to the next subfolder, if there is one
+        $this->iterator_file_index = 0;
+        $this->iterator_folder_index++;
+        $iterator_folder = $this->iterator_folders[
+            $this->iterator_folder_index] ?? false;
+        if (!$iterator_folder) {
+            return false;
+        }
+        $folder_path = "$folder/$iterator_folder";
+        $this->iterator_files = $this->readRecords("$folder_path/" .
+            LSMTree::INDEX_FILE);
+        if (empty($this->iterator_files) ||
+            !is_array($this->iterator_files)) {
+            return false;
+        }
+        $file = $this->iterator_files[0];
+        $file_path = "$folder_path/$file";
+        $this->iterator_entries = $this->readRecords($file_path, "\xFF");
+        return decode255($this->iterator_entries[0]) ?? false;
+    }
+    /**
+     * Puts the iterator of this Tier back into its initial state: all
+     * positions are zero and no subfolders, files, or entries are loaded,
+     * so the following call to next() starts from the first entry
+     */
+    public function reset()
+    {
+        $this->iterator_folder_index = $this->iterator_file_index =
+            $this->iterator_entry_index = 0;
+        $this->iterator_folders = $this->iterator_files =
+            $this->iterator_entries = [];
+    }
+    /**
+     * Writes an array of strings to a file, joined by a delimiter, and
+     * removes any copy of that file held in the readRecords() cache
+     *
+     * @param string $filename path of the file to write
+     * @param array $lines strings to write
+     * @param string $delimiter string placed between consecutive $lines
+     */
+    function writeRecords($filename, $lines, $delimiter = "\n")
+    {
+        $contents = implode($delimiter, $lines);
+        file_put_contents($filename, $contents);
+        unset(self::$cache[crawlHash($filename)]);
+    }
+    /**
+     * Reads a file and splits it into records on a delimiter. Results are
+     * kept in a cache shared by all Tier objects and keyed by the
+     * crawlHash of the file name; a cache hit moves the file to the most
+     * recently used end of the cache. Once the cache holds more than
+     * LSMTree::RECORD_CACHE_SIZE files the least recently used ones are
+     * evicted.
+     *
+     * @param string $filename path of the file to read
+     * @param string $delimiter string separating records in the file
+     * @return array records of the file
+     */
+    function readRecords($filename, $delimiter = "\n")
+    {
+        $name_hash = crawlHash($filename);
+        if (isset(self::$cache[$name_hash])) {
+            $records = self::$cache[$name_hash];
+            unset(self::$cache[$name_hash]);
+            self::$cache[$name_hash] = $records; //move to end of array
+            return $records;
+        }
+        $records = explode($delimiter, file_get_contents($filename));
+        self::$cache[$name_hash] = $records;
+        /* bound the number of cached files (not the number of records of
+           the file just read). The newest entry is at the end of the
+           array so it is never the one evicted here.
+         */
+        while (count(self::$cache) > LSMTree::RECORD_CACHE_SIZE) {
+            reset(self::$cache);
+            unset(self::$cache[key(self::$cache)]);
+        }
+        return $records;
+    }
+}
diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php
index 06f79ed7b..48fe14ff9 100644
--- a/src/library/PackedTableTools.php
+++ b/src/library/PackedTableTools.php
@@ -216,7 +216,7 @@ class PackedTableTools
      *  version of such a table, or a file handle to the end of such a file
      * @param string $key a key string of length given by the signature of this
      *      PackedTableTools
-     * @param string $table_row a record packed according tot the signature
+     * @param string $table_row a record packed according to the signature
      *      of this PackedTableTools
      * @param int $add_method one of self::ADD_MEM_TABLE, self::ADD_FILE_HANDLE,
      *      self::ADD_FILE_PATH indicating which of the three possibilities
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 27cbd0a95..1e360d5ae 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -406,7 +406,10 @@ class PartitionDocumentBundle
         return $value;
     }
     /**
-     *
+     * Get the file path in the LSMTree for the block folder the partition $i
+     * should be in
+     * @param int $i a partition number to find the block path for
+     * @return string file path for block folder
      */
     public function getPartitionBlock($i)
     {
@@ -417,7 +420,10 @@ class PartitionDocumentBundle
         return $folder . "/". self::BLOCK_PREFIX . "$block";
     }
     /**
+     * Get the file path in the LSMTree for the partition $i folder
      *
+     * @param int $i a partition number to find the partition folder for
+     * @return string file path for partition folder
      */
     public function getPartitionFolder($i)
     {
@@ -458,7 +464,7 @@ class PartitionDocumentBundle
      *  use a cached value if present
      * @param int $mode PackedTableTools mode to use when reading in partition
      * @return mixed either a string if $mode as AS_STRING_MODE, or
-            array $key => packed records pairs where records are
+     *    array $key => packed records pairs where records are
      *  packed according to this PartitionDocumentBundle's signature
      */
     public function loadPartitionIndex($partition, $force_load = false,
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 90dab02b6..da752e575 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -481,9 +481,13 @@ class WordIterator extends IndexBundleIterator
     {
         $positions_filename = $base_folder . "/" .
             IndexDocumentBundle::POSITIONS_FILENAME;
-        $fh = (file_exists($positions_filename)) ?
-            fopen($positions_filename, "r") : false;
-        $file_size = filesize($positions_filename);
+        if (file_exists($positions_filename)) {
+            $fh = fopen($positions_filename, "r");
+            $file_size = filesize($positions_filename);
+        } else {
+            $fh = false;
+            $file_size = 0;
+        }
         return [$fh, $file_size];
     }
     /**
diff --git a/tests/BPlusTreeTest.php b/tests/BPlusTreeTest.php
index c8ed999ce..11807a4ca 100644
--- a/tests/BPlusTreeTest.php
+++ b/tests/BPlusTreeTest.php
@@ -22,8 +22,7 @@
  *
  * END LICENSE
  *
- * This file contains unit tests of the BTree class used to keep track of
- * etags during a crawl
+ * This file contains unit tests of the BPlusTree class
  *
  * @author Chris Pollett chris@pollett.org
  * @license https://www.gnu.org/licenses/ GPL3
@@ -46,7 +45,7 @@ use seekquarry\yioop\library\UnitTest;
  * Used to test insert, lookup, and deletion of key-value pairs on the B+-Tree.
  * @author Chris Pollett
  */
- class BPlusTreeTest extends UnitTest
+class BPlusTreeTest extends UnitTest
 {
     /**
      * Test directory to hold btree used for these unit tests
diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php
index c1eedca6b..5015c3b19 100644
--- a/tests/IndexDocumentBundleTest.php
+++ b/tests/IndexDocumentBundleTest.php
@@ -44,7 +44,7 @@ use seekquarry\yioop\library\UnitTest;
  * Used to test that the IndexDocumentBundle class can properly add and
  * retrieve documents. Check its prepareMethod correctly deduplicates
  * documents before inverted index creation. Tests inverted index creation
- * and adding terms to IndexDocumentBundle's BPlusTree. Check look up of
+ * and adding terms to IndexDocumentBundle's LSMTree. Check look up of
  * documents according to term.
  */
  class IndexDocumentBundleTest extends UnitTest
diff --git a/tests/IndexManagerTest.php b/tests/IndexManagerTest.php
index 2c6ff61dc..8dba23ae9 100644
--- a/tests/IndexManagerTest.php
+++ b/tests/IndexManagerTest.php
@@ -103,8 +103,7 @@ use seekquarry\yioop\library\UnitTest;
     {
         $version_new = IndexManager::getVersion(self::TEST_DIR . "/".
             self::NEW_BUNDLE);
-        echo $version_new;
-        $this->assertEqual($version_new, "3.3", "Version 3.3 index detected");
+        $this->assertEqual($version_new, "4.0", "Version 4.0 index detected");
     }
     /**
      * Tests if IndexManager can return the dictionary information about a
diff --git a/tests/LSMTreeTest.php b/tests/LSMTreeTest.php
new file mode 100644
index 000000000..befaf32b1
--- /dev/null
+++ b/tests/LSMTreeTest.php
@@ -0,0 +1,292 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * This file contains unit tests of the LSMTree class
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2024
+ * @filesource
+ */
+namespace seekquarry\yioop\tests;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\models\Model;
+use seekquarry\yioop\library\LSMTree;
+use seekquarry\yioop\library\PackedTableTools;
+use seekquarry\yioop\library\UnitTest;
+
+/**
+ * Yioop LSMTree Unit Class
+ *
+ * Used to test insert, lookup, and deletion of key-value pairs in a
+ * Log-structured Merge Tree.
+ * @author Chris Pollett
+ */
+class LSMTreeTest extends UnitTest
+{
+    /**
+     * Prefix, relative to this file's folder, of the test LSMTree folders
+     */
+    const TEST_DIR = '/test_files/lsm_tree_test';
+    /**
+     * Folders of the LSMTrees created by createTree, deleted in tearDown
+     * @var array
+     */
+    public $table_dirs;
+    /**
+     * Resets the list of LSMTree folders that tearDown will delete
+     */
+    public function setUp()
+    {
+        $this->table_dirs = [];
+    }
+    /**
+     * Delete the LSMTree folders created during the test
+     */
+    public function tearDown()
+    {
+        $model = new Model();
+        foreach ($this->table_dirs as $table_dir) {
+            $model->db->unlinkRecursive($table_dir, true);
+        }
+        $this->table_dirs = [];
+    }
+    /**
+     * Creates an LSMTree in a folder whose name is built from TEST_DIR, the
+     * primary key size, $max_file_size, $block_factor, and $suffix. The
+     * folder is recorded in $this->table_dirs so tearDown can delete it.
+     *
+     * @param array $format column format passed to the LSMTree constructor
+     * @param int $max_file_size max file size passed to the constructor
+     * @param int $block_factor block factor passed to the constructor
+     * @param string $suffix appended to folder name to tell trees apart
+     * @return LSMTree the newly constructed tree
+     */
+    public function createTree($format, $max_file_size, $block_factor,
+        $suffix = "")
+    {
+        $key_size = $format["PRIMARY KEY"][1];
+        $table_dir = __DIR__ . self::TEST_DIR .
+            "_{$key_size}_{$max_file_size}_{$block_factor}_$suffix";
+        $this->table_dirs[] = $table_dir;
+        return new LSMTree($table_dir, $format, $max_file_size,
+            $block_factor);
+    }
+    /**
+     * Test putting items in lsm-tree and
+     * then seeing if the items can be retrieved
+     */
+    public function simplePutGetTierTestCase()
+    {
+        for ($max_file_size = 20; $max_file_size < 60; $max_file_size += 30) {
+            for ($block_factor = 1; $block_factor < 3; $block_factor++) {
+                for ($key_len = 16; $key_len > 5; $key_len -= 8) {
+                    for ($cols = 0; $cols < 3; $cols++) {
+                        $format = ["PRIMARY KEY" => ["KEY", $key_len],
+                            "VALUE" => "TEXT"];
+                        for ($j = 0; $j < $cols; $j++) {
+                            $format["COL$j"] = "INT";
+                        }
+                        $lsm_tree = $this->createTree($format, $max_file_size,
+                            $block_factor, "{$key_len}_$cols");
+                        for ($i = 0; $i < 50; $i++) {
+                            $key = str_pad("key$i", $key_len, "0",
+                                STR_PAD_LEFT);
+                            $entry = ["KEY" => $key, "VALUE" => "value$i"];
+                            for ($j = 0; $j < $cols; $j++) {
+                                $entry["COL$j"] = $i;
+                            }
+                            $lsm_tree->put($entry);
+                        }
+                        $lsm_tree->flush();
+                        for ($i = 0; $i < 50; $i++) {
+                            $key = str_pad("key$i", $key_len, "0",
+                                STR_PAD_LEFT);
+                            $values = $lsm_tree->get($key);
+                            $this->assertEqual("value$i", $values[0]["VALUE"],
+                                "{$i}th insert key length $key_len, pad ".
+                                "columns $cols into single LSM Tier ".
+                                "{$values[0]["VALUE"]} should be retrieved ".
+                                "as value$i");
+                        }
+                    }
+                }
+            }
+        }
+    }
+    /**
+     * Checks that the correct number of block folders are created when
+     * inserting several items into an LSMTree with a small block factor
+     * and max file size
+     */
+    public function blockFactorTestCase()
+    {
+        $max_file_size = 20;
+        $block_factor = 2;
+        $key_len = 16;
+        $format = ["PRIMARY KEY" => ["KEY", $key_len],
+            "VALUE" => "TEXT"];
+        $lsm_tree = $this->createTree($format, $max_file_size,
+            $block_factor, "{$key_len}");
+        for ($i = 0; $i < 15; $i++) {
+            $key = str_pad("key$i", $key_len, "0",
+                STR_PAD_LEFT);
+            $entry = ["KEY" => $key, "VALUE" => "value$i"];
+            $lsm_tree->put($entry);
+        }
+        $this->assertTrue(file_exists($lsm_tree->folder .
+            "/Tier0/A/F00000000000key12") && !file_exists($lsm_tree->folder .
+                "/Tier0/A/F00000000000key14"),
+            "Correct number of block folders created");
+    }
+    /**
+     * Checks that the correct number of data files are created when
+     * inserting several items into an LSMTree with a small max file size
+     */
+    public function maxFileSizeTestCase()
+    {
+        $max_file_size = 50;
+        $block_factor = 10000;
+        $key_len = 16;
+        $format = ["PRIMARY KEY" => ["KEY", $key_len],
+            "VALUE" => "TEXT"];
+        $lsm_tree = $this->createTree($format, $max_file_size,
+            $block_factor, "{$key_len}");
+        for ($i = 0; $i < 15; $i++) {
+            $key = str_pad("key$i", $key_len, "0",
+                STR_PAD_LEFT);
+            $entry = ["KEY" => $key, "VALUE" => "value$i"];
+            $lsm_tree->put($entry);
+        }
+        $block_folder = $lsm_tree->folder . "/Tier0/A/F000000000000key0";
+        $this->assertTrue(file_exists("$block_folder/D00000000000key13") &&
+            !file_exists("$block_folder/D00000000000key14"),
+            "Correct number of data files created");
+    }
+    /**
+     * Tests that values for identical keys are correctly merged when merging
+     * two slots within a tier.
+     */
+    public function mergeRecordsTierTestCase()
+    {
+        $max_file_size = 50;
+        $block_factor = 2;
+        $key_len = 16;
+        $cols = 2;
+        $format = ["PRIMARY KEY" => ["KEY", $key_len],
+            "VALUE" => "TEXT"];
+        for ($j = 0; $j < $cols; $j++) {
+            $format["COL$j"] = "INT";
+        }
+        $lsm_tree = $this->createTree($format, $max_file_size,
+            $block_factor, "{$key_len}_$cols");
+        foreach ([["A", 0, 10], ["B", 5, 15]] as $slot_info) {
+            $lsm_tree->selectPutSlot($slot_info[0]);
+            for ($i = $slot_info[1]; $i < $slot_info[2]; $i++) {
+                $key = str_pad("key$i", $key_len, "0",
+                    STR_PAD_LEFT);
+                $entry = ["KEY" => $key, "VALUE" => "value{$slot_info[0]}$i"];
+                for ($j = 0; $j < $cols; $j++) {
+                    $entry["COL$j"] = $i;
+                }
+                $lsm_tree->put($entry);
+            }
+            $lsm_tree->flush();
+        }
+        $lsm_tree->mergeTiers();
+        for ($i = 0; $i < 15; $i++) {
+            $key = str_pad("key$i", $key_len, "0", STR_PAD_LEFT);
+            $values = $lsm_tree->get($key);
+            if ($i < 10) {
+                $this->assertEqual("valueA$i", $values[0]["VALUE"],
+                    "Able to retrieve Slot A value $i");
+            }
+            if ($i > 4 && $i < 10) {
+                $this->assertEqual("valueB$i", $values[1]["VALUE"],
+                    "Able to retrieve Slot B value $i");
+            }
+            if ($i >= 10) {
+                $this->assertEqual("valueB$i", $values[0]["VALUE"],
+                    "Able to retrieve Slot B value $i");
+            }
+        }
+    }
+    /**
+     * Tests that tiers are correctly created and merged when many items are
+     * inserted into a tree with a small block factor and max file size.
+     * Test correctness by seeing if each value can be retrieved with its key.
+     */
+    public function mergeTiersTestCase()
+    {
+        $max_file_size = 50;
+        $block_factor = 2;
+        $key_len = 16;
+        $cols = 3;
+        $format = ["PRIMARY KEY" => ["KEY", $key_len],
+            "VALUE" => "TEXT"];
+        for ($j = 0; $j < $cols; $j++) {
+            $format["COL$j"] = "INT";
+        }
+        $lsm_tree = $this->createTree($format, $max_file_size,
+            $block_factor, "{$key_len}_$cols");
+        $slot = "B";
+        for ($i = 0; $i < 250; $i++) {
+            if ($i % 5 == 0) {
+                $slot = ($slot == "B") ? "A" : "B";
+                $lsm_tree->flush();
+                $lsm_tree->selectPutSlot($slot);
+            }
+            if ($i > 0 && $i % 10 == 0) {
+                $lsm_tree->mergeTiers();
+            }
+            // slot A gets even keys of each run of ten, slot B the odd ones
+            $k = 10 * floor($i / 10);
+            $k += ($slot == "A") ? 2 * ($i % 10) : 2 * (($i % 10) - 5) + 1;
+            $key = str_pad("key$k", $key_len, "0", STR_PAD_LEFT);
+            $entry = ["KEY" => $key, "VALUE" => "value$k"];
+            for ($j = 0; $j < $cols; $j++) {
+                $entry["COL$j"] = $k;
+            }
+            $lsm_tree->put($entry);
+        }
+        $lsm_tree->flush();
+        $lsm_tree->mergeTiers();
+        $this->assertEqual($lsm_tree->getMaxTier(), 5,
+            "Final maxTiers is correct");
+        $tiers = [false, true, false, false, true, true];
+        for ($i = 0; $i <= 5; $i++) {
+            $this->assertEqual($lsm_tree->occupiedTier($i), $tiers[$i],
+                "{$i} th tier correctly occupied");
+        }
+        for ($i = 0; $i < 250; $i++) {
+            $key = str_pad("key$i", $key_len, "0", STR_PAD_LEFT);
+            $values = $lsm_tree->get($key);
+            $this->assertEqual("value$i", $values[0]["VALUE"],
+                "Able to retrieve {$i}th value");
+        }
+    }
+}
ViewGit