Tweaking file cache sizes related to indexing

Chris Pollett [2024-01-21 20:Jan:st]
Tweaking file cache sizes related to indexing
Filename
src/executables/ArcTool.php
src/library/IndexDocumentBundle.php
src/library/LSMTree.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 0ee914edc..2c8345806 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -1481,7 +1481,8 @@ EOD;
                 $old_next_partition = $next_partition;
                 $num_forks = min($save_partition - $next_partition,
                     $number_of_processes);
-                echo "Num forks:$num_forks, num processes $number_of_processes\n";
+                echo "Num forks:$num_forks, ".
+                    "num processes $number_of_processes\n";
                 for ($i = 0; $i < $num_forks; $i++) {
                     $process_partition = $next_partition + $i;
                     $process_options = sprintf($options, $i,
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 81835fab3..d565177ae 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -158,6 +158,10 @@ class IndexDocumentBundle implements CrawlConstants
      * addPartitionPostingsDictionary
      */
     const POSTINGS_BUFFER_SIZE = 1000000;
+    /**
+     * Maximum number of posting slices to cache
+     */
+    const MAX_POSTING_CACHE_ITEMS = 100;
     /**
      * Holds property value pairs concerning the configuration of the
      * current IndexDocumentBundle
@@ -1533,13 +1537,12 @@ class IndexDocumentBundle implements CrawlConstants
     {
         static $file_handles = [];
         static $memory_limit = 0;
-        $max_cache_size = 500;
         if (!$memory_limit) {
             $memory_limit =
-                metricToInt(ini_get("memory_limit")) * C\MEMORY_FILL_FACTOR;
+                C\INDEX_FILE_MEMORY_LIMIT * C\MEMORY_FILL_FACTOR;
         }
         if (memory_get_usage() > $memory_limit ||
-            count($file_handles) > $max_cache_size) {
+            count($file_handles) > self::MAX_POSTING_CACHE_ITEMS) {
             array_shift($file_handles); /*just in case file handles causing
                     memory leak */
         }
diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php
index 8dc98f979..5e38aa08b 100644
--- a/src/library/LSMTree.php
+++ b/src/library/LSMTree.php
@@ -231,7 +231,13 @@ class LSMTree
         $db->unlinkRecursive($tier_folder, true);
     }
     /**
+     * Compares the keys in two LSMTree entries and returns
+     * -1 if $entry_a < $entry_b, 0 if $entry_a == $entry_b,
+     * and 1 if $entry_a > $entry_b
      *
+     * @param string $entry_a first LSMTree entry
+     * @param string $entry_b second LSMTree entry
+     * @return int result of comparison as described above
      */
     public function compare($entry_a, $entry_b)
     {
@@ -239,7 +245,15 @@ class LSMTree
         return strncmp($entry_a, $entry_b, $key_len);
     }
     /**
+     * Combines two LSMTree entries with the same key value into a single
+     * entry. The values of an entry begin with number of items stored
+     * followed by items in the format specified by the LSMTree constructor.
+     * So the output combines the two item counts and concatenates the
+     * items themselves.
      *
+     * @param string $entry_a first entry to combine
+     * @param string $entry_b second entry to combine
+     * @return string combined entry
      */
     public function mergeEntries($entry_a, $entry_b)
     {
@@ -745,7 +759,9 @@ class Tier
         }
         self::$cache[$name_hash] =
             explode($delimiter, file_get_contents($filename));
-        if (count(self::$cache[$name_hash]) >= LSMTree::RECORD_CACHE_SIZE) {
+        if (count(self::$cache[$name_hash]) >= LSMTree::RECORD_CACHE_SIZE ||
+            memory_get_usage() >
+            C\INDEX_FILE_MEMORY_LIMIT * C\MEMORY_FILL_FACTOR) {
             array_shift(self::$cache);
         }
         return self::$cache[$name_hash];
ViewGit