Fix LSMTreeTest unit tests, add more documentation for LSMTree, fix cache in IndexDocumentBundle::getPostingsString

Chris Pollett [2024-01-17 23:Jan:th]
Fix LSMTreeTest unit tests, add more documentation for LSMTree, fix cache in IndexDocumentBundle::getPostingsString
Filename
src/library/IndexDocumentBundle.php
src/library/LSMTree.php
tests/LSMTreeTest.php
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 1235b4d14..81835fab3 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -1533,12 +1533,14 @@ class IndexDocumentBundle implements CrawlConstants
     {
         static $file_handles = [];
         static $memory_limit = 0;
+        $max_cache_size = 500;
         if (!$memory_limit) {
             $memory_limit =
                 metricToInt(ini_get("memory_limit")) * C\MEMORY_FILL_FACTOR;
         }
-        if (memory_get_usage() > $memory_limit) {
-            $file_handles = []; /*just in case file handles causing
+        if (memory_get_usage() > $memory_limit ||
+            count($file_handles) > $max_cache_size) {
+            array_shift($file_handles); /*just in case file handles causing
                     memory leak */
         }
         if (empty($file_handles[$partition])) {
@@ -1552,6 +1554,8 @@ class IndexDocumentBundle implements CrawlConstants
             $file_handles[$partition] = $fh;
         } else {
             $fh = $file_handles[$partition];
+            unset($file_handles[$partition]); // move to back of queue
+            $file_handles[$partition] = $fh;
         }
         if ($fh && fseek($fh, $offset) == 0 && $len > 0) {
             $out = fread($fh, $len);
@@ -1639,7 +1643,7 @@ class IndexDocumentBundle implements CrawlConstants
      * POSITION_OFFSETS values. It also computes the of the frequencies of items
      * within the list of postings. This method is current only used for
      * active partition in an index (the one whose terms haven't yet been added
-     * to the B+-tree).
+     * to the LSMTree).
      *
      * @param array &$postings a reference to an array of posting lists for a
      *  term (this will be changed by this method)
diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php
index 639ad42f5..8dc98f979 100644
--- a/src/library/LSMTree.php
+++ b/src/library/LSMTree.php
@@ -89,11 +89,13 @@ class LSMTree
      */
     public $max_file_size;
     /**
+     *
      * @var Tier
      */
     public $put_slot = null;
     /**
-     *
+     * PackedTableTools used to pack/unpack key/values records
+     * @var PackedTableTools
      */
     public $table_tools;
     /**
@@ -217,7 +219,9 @@ class LSMTree
         crawlLog("..End LSMTiers Merging Tiers..");
     }
     /**
+     * Deletes tier $tier from the LSMTree
      *
+     * @param int $tier tier to delete from LSMTree
      */
     public function emptyTier($tier)
     {
@@ -247,7 +251,11 @@ class LSMTree
         return $encoded_key . $out_values;
     }
     /**
+     * Merges two tier slots of the same tier $tier into a single tier slot
+     * at tier $tier + 1. If fewer than two slots are filled at the given
+     * tier, then it does nothing.
      *
+     * @param int $tier tier to perform merging for
      */
     public function mergeTier($tier)
     {
@@ -327,7 +335,12 @@ class LSMTree
         return $rows;
     }
     /**
+     * Returns the values associated with a key in a given Tier of the
+     * LSMTree
      *
+     * @param int $tier tier to get values from
+     * @param string $key key to look up values for
+     * @return array values associated with $key (unpacked)
      */
     public function getTier($tier, $key)
     {
@@ -338,37 +351,17 @@ class LSMTree
         $slot = new Tier($slot_folder, $this->table_tools);
         return $slot->get($key);
     }
-    /**
-     * Save the operating parameters of this LSMTree
-     */
-    public function saveParameters()
-    {
-        $parameter_path = $this->folder . "/" . self::PARAMETERS_FILE;
-        file_put_contents($parameter_path, serialize($this->parameters),
-            LOCK_EX);
-    }
-    /**
-     * Returns the parameters (such as its signature, max keys per nodes, etc)
-     * used to configure the LSMTree stored at $folder
-     *
-     * @param string $folder file path to a stored LSMTree
-     * @return array configuration info about the LSMTree
-     */
-    public static function getParameterInfo($folder)
-    {
-        $parameter_path = $folder . "/" . self::PARAMETERS_FILE;
-        if(file_exists($parameter_path)) {
-            $parameters = unserialize(file_get_contents($parameter_path)) ?? [];
-            return $parameters;
-        } else {
-            return [];
-        }
-    }
 }
 /**
+ * Splits a string containing one row of data for the LSMTree into the
+ * key and a string for the values.
  *
+ * @param string $entry string encoded row to be split
+ * @param PackedTableTools $table_tools which has the format used to
+ *  encode the entry
+ * @return array [$key, $values (as a string)]
  */
-function entryToKeyValues($entry, $table_tools, $decode_key = false)
+function entryToKeyValues($entry, $table_tools)
 {
     $key_len = $table_tools->key_len;
     $key = substr($entry, 0, $key_len);
@@ -376,68 +369,75 @@ function entryToKeyValues($entry, $table_tools, $decode_key = false)
     return [$key, $values];
 }
 /**
- *
+ * Auxiliary Class used to manage a single Tier from the Logarithmic Merge Tree
+ * structure
  */
 class Tier
 {
     /**
-     *
+     * how many data files should be in a block folder before making a
+     * new block folder
+     * @var int
      */
     public $block_factor;
     /**
-     *
+     * For data that has not been flushed to disk, the first key in sorted
+     * order
+     * @var string
      */
     public $first_active_key;
     /**
-     *
+     * File path to where data in this tier is to be stored
+     * @var string
      */
     public $folder;
     /**
-     *
+     * @var int
      */
     public $iterator_folder_index;
     /**
-     *
+     * @var array
      */
     public $iterator_folders;
     /**
-     *
+     * @var int
      */
     public $iterator_file_index;
     /**
-     *
+     * @var array
      */
     public $iterator_files;
     /**
-     *
+     * @var int
      */
     public $iterator_entry_index;
     /**
-     *
+     * @var array
      */
     public $iterator_entries;
     /**
-     *
+     * @var int
      */
     public $max_file_size;
     /**
-     *
+     * Access mode for data in this tier: r - read, w - write
+     * @var string
      */
     public $mode;
     /**
-     *
+     * @var PackedTableTools
      */
     public $table_tools;
     /**
-     *
+     * @var string
      */
     private $records;
     /**
-     *
+     * @var string
      */
     private $active_filename;
     /**
-     *
+     * @var array
      */
     private static $cache = [];
     /**
@@ -584,7 +584,7 @@ class Tier
         }
         if (empty($this->records)) {
             list($this->first_active_key,) = entryToKeyValues($entry,
-                $table_tools, true);
+                $table_tools);
         }
         $separator = (strlen($this->records) > 0) ? "\xFF" : "";
         $this->records .= $separator . $encoded_entry;
@@ -617,7 +617,10 @@ class Tier
         return $haystack[$low];
     }
     /**
+     * Returns the first entry as a packed string in the LSMTree tier. Also
+     * resets iterator of this object.
      *
+     * @return string|bool first entry if exists, else false
      */
     public function firstEntry()
     {
@@ -649,7 +652,9 @@ class Tier
         return decode255($this->iterator_entries[0]);
     }
     /**
-     *
+     * Returns the next tier entry as a packed string iterated over by this
+     * Tier object.
+     * @return string|bool next entry if exists, else false
      */
     public function next()
     {
@@ -692,7 +697,8 @@ class Tier
         return decode255($this->iterator_entries[0]) ?? false;
     }
     /**
-     *
+     * Resets to the first entry of the tier, the iterator associated with
+     * the current Tier object.
      */
     public function reset()
     {
@@ -704,7 +710,14 @@ class Tier
         $this->iterator_entries = [];
     }
     /**
+     * Write a sequence of string records $lines into the file $filename,
+     * separating records with delimiter $delimiter. Deletes file from LRU
+     * cache of read files
      *
+     * @param string $filename name of file to write records to
+     * @param array $lines records to write to $filename
+     * @param string $delimiter string used to separate one record from the
+     *  next
      */
     function writeRecords($filename, $lines, $delimiter = "\n")
     {
@@ -713,7 +726,13 @@ class Tier
         unset(self::$cache[$name_hash]);
     }
     /**
-     *
+     * Returns the contents of a file managed by this LSMTree
+     * as a sequence of string records. Contents come from either
+     * a cache or from the filesystem. Has logic for LRU cache
+     * @param string $filename name of file to get records for
+     * @param string $delimiter delimiter used to separate individual
+     *   records
+     * @return array of string records
      */
     function readRecords($filename, $delimiter = "\n")
     {
@@ -726,7 +745,7 @@ class Tier
         }
         self::$cache[$name_hash] =
             explode($delimiter, file_get_contents($filename));
-        if (count(self::$cache[$name_hash]) >= LSMTRee::RECORD_CACHE_SIZE) {
+        if (count(self::$cache[$name_hash]) >= LSMTree::RECORD_CACHE_SIZE) {
             array_shift(self::$cache);
         }
         return self::$cache[$name_hash];
diff --git a/tests/LSMTreeTest.php b/tests/LSMTreeTest.php
index befaf32b1..d8fd5c194 100644
--- a/tests/LSMTreeTest.php
+++ b/tests/LSMTreeTest.php
@@ -157,8 +157,8 @@ class LSMTreeTest extends UnitTest
             $lsm_tree->put($entry);
         }
         $this->assertTrue(file_exists($lsm_tree->folder .
-            "/Tier0/A/F00000000000key12") && !file_exists($lsm_tree->folder .
-                "/Tier0/A/F00000000000key14"),
+            "/Tier0000/A/F00000000000key12") && !file_exists($lsm_tree->folder .
+                "/Tier0000/A/F00000000000key14"),
             "Correct number of block folders created");
     }
     /**
@@ -181,7 +181,7 @@ class LSMTreeTest extends UnitTest
             $entry = ["KEY" => $key, "VALUE" => "value$i"];
             $lsm_tree->put($entry);
         }
-        $block_folder = $lsm_tree->folder . "/Tier0/A/F000000000000key0";
+        $block_folder = $lsm_tree->folder . "/Tier0000/A/F000000000000key0";
         $this->assertTrue(file_exists("$block_folder/D00000000000key13") &&
             !file_exists("$block_folder/D00000000000key14"),
             "Correct number of data files created");
ViewGit