tweaks to try to speed up BPlusTree find operation, a=chris

Chris Pollett [2022-07-26 19:Jul:th]
tweaks to try to speed up BPlusTree find operation, a=chris
Filename
src/library/BPlusTree.php
src/locale/en_US/resources/Tokenizer.php
diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php
index d5baddf52..682d3f78d 100644
--- a/src/library/BPlusTree.php
+++ b/src/library/BPlusTree.php
@@ -136,6 +136,7 @@ class BPlusTree
      * Used to keep track of when this instance was created, as part of managing
      * file handles expiration (could be set/updated externally to reflect
      * some other instance using the BPlusTree)
+     * @var int
      */
     public $instance_time;
     /**
@@ -143,6 +144,23 @@ class BPlusTree
      * @var string
      */
     public $key_field;
+    /**
+     * Last folder path of a find operation, provided this was cacheable
+     * @var string
+     */
+    public $last_find_folder = null;
+    /**
+     * Last encoded key used for a find operation, provided this was cacheable
+     * Used to avoid recomputing the path down the tree when it is unchanged.
+     * @var string
+     */
+    public $last_find_key = null;
+    /**
+     * First key of next node after returned node for the last find operation,
+     * provided this was cacheable
+     * @var string
+     */
+    public $last_find_next_key = null;
     /**
      * Storage for root node of the B-Tree
      * @var object
@@ -426,6 +444,9 @@ class BPlusTree
      */
     public function splitRootNode()
     {
+        $this->last_find_folder = null;
+        $this->last_find_key = null;
+        $this->last_find_next_key = null;
         $folder = $this->folder;
         $this->add_archive_cache = [null, "", -1];
         $this->get_archive_cache = [null, "", -1];
@@ -479,6 +500,9 @@ class BPlusTree
      */
     public function splitRecordsInLeaf($node_path, $node)
     {
+        $this->last_find_folder = null;
+        $this->last_find_key = null;
+        $this->last_find_next_key = null;
         $parent_folder = $this->getParentFolder($node_path);
         $archive_prefix = self::ARCHIVE_PREFIX;
         $temp_node_name = self::TEMP_NODE_NAME;
@@ -651,14 +675,22 @@ class BPlusTree
     public function find($key, $is_encoded_key = false)
     {
         $encode_key = ($is_encoded_key) ? $key : rawurlencode($key);
+        if (!empty($this->last_find_folder) && !empty($this->last_find_key) &&
+            $this->last_find_key <= $encode_key
+            && !empty($this->last_find_next_key) &&
+            $encode_key < $this->last_find_next_key) {
+                return $this->last_find_folder;
+        }
         $current_folder = $this->folder;
         $cache = & $this->tree_path_cache;
         $node_prefix = self::NODE_PREFIX;
         $least_node_name = self::LEAST_NODE_NAME;
         $node_prefix_and_key = self::NODE_PREFIX . $encode_key;
         while (isset($cache[$current_folder]) || is_dir($current_folder)) {
+            $current_prefix = "$current_folder/$node_prefix";
+            $len_current_prefix = strlen($current_prefix);
             if (!isset($cache[$current_folder])) {
-                $cache[$current_folder] = glob("$current_folder/$node_prefix*");
+                $cache[$current_folder] = glob("$current_prefix*");
             }
             $nodes = $cache[$current_folder];
             if (empty($nodes)) {
@@ -668,16 +700,45 @@ class BPlusTree
                 break;
             }
             $exact_node = "$current_folder/$node_prefix_and_key";
-            $first = true;
-            $next_node = "$current_folder/$least_node_name";
-            foreach ($nodes as $node) {
-                if (($first || $next_node < $node) &&
-                    $node <= $exact_node) {
-                    $first = false;
-                    $next_node = $node;
+            $least_node = "$current_folder/$least_node_name";
+            $first_index = 0;
+            $last_index = count($nodes) - 1;
+            $this->last_find_folder = null;
+            $this->last_find_key = null;
+            $this->last_find_next_key = null;
+            if ($exact_node < $nodes[$first_index]) {
+                $this->last_find_folder = $least_node;
+                $this->last_find_key = $encode_key;
+                $this->last_find_next_key = substr($nodes[$first_index],
+                    $len_current_prefix);
+                $current_folder = $least_node;
+            } else if ($exact_node == $nodes[$first_index]) {
+                if (!empty($nodes[$first_index + 1])) {
+                    $this->last_find_folder = $nodes[$first_index];
+                    $this->last_find_key = $encode_key;
+                    $this->last_find_next_key = substr($nodes[$first_index + 1],
+                        $len_current_prefix);
                 }
+                $current_folder = $nodes[$first_index];
+            } else if ($nodes[$last_index] <= $exact_node) {
+                $current_folder = $nodes[$last_index];
+            } else {
+                while ($first_index < $last_index) {
+                    $mid_index = ceil($first_index + $last_index);
+                    if ($nodes[$mid_index] > $exact_node) {
+                        $last_index = $mid_index - 1;
+                    } else {
+                        $first_index = $mid_index;
+                    }
+                }
+                if (!empty($nodes[$first_index + 1])) {
+                    $this->last_find_folder = $nodes[$first_index];
+                    $this->last_find_key = $encode_key;
+                    $this->last_find_next_key = substr($nodes[$first_index + 1],
+                        $len_current_prefix);
+                }
+                $current_folder = $nodes[$first_index];
             }
-            $current_folder = $next_node;
         }
         $return_folder =  null;
         if ($nodes == $current_folder) {
@@ -688,6 +749,10 @@ class BPlusTree
             }
             $cache[$current_folder] = $current_folder;
             $return_folder =  $current_folder;
+        } else {
+            $this->last_find_folder = null;
+            $this->last_find_key = null;
+            $this->last_find_next_key = null;
         }
         return $return_folder;
     }
@@ -702,7 +767,7 @@ class BPlusTree
     {
         $parent_folder = $this->getParentFolder($node_filename);
         $len = strlen($parent_folder);
-        $node_name = substr($node_filename, $len +1);
+        $node_name = substr($node_filename, $len + 1);
         if ($node_name == self::LEAST_NODE_NAME) {
              return $parent_folder. "/" . self::ARCHIVE_PREFIX .
                 self::LEAST_NODE_NAME;
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index f1e1acfb3..3c104d8b6 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -1293,6 +1293,9 @@ class Tokenizer
     private static function stemPhrase($phrase)
     {
         $terms = mb_split("[[:space:]]", $phrase);
+        if (empty($terms)) {
+            return "";
+        }
         $stemmed_phrase = "";
         $space = "";
         foreach ($terms as $term) {
ViewGit