Finish LinearHashTable documentation, a=chris

Chris Pollett [2021-09-21 18:Sep:st]
Finish LinearHashTable documentation, a=chris
Filename
src/library/BPlusTree.php
src/library/LinearHashTable.php
src/library/PartitionDocumentBundle.php
diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php
index 0526c17dc..abef3e927 100644
--- a/src/library/BPlusTree.php
+++ b/src/library/BPlusTree.php
@@ -124,8 +124,9 @@ class BPlusTree
      */
     public $insert_node_cache = [];
     /**
-     * Used to keep track if the file_handle to the archive cache has changed
-     * since the last get operation
+     * Used to keep track of when this instance was created, as part of managing
+     * file handles expiration (could be set/updated externally to reflect
+     * some other instance using the BPlusTree)
      */
     public $instance_time;
     /**
@@ -775,6 +776,7 @@ class BPlusTree
      * Add a value to an archive file $archive_filename of a B+-tree node
      * @param string $archive_filename name of archive file
      * @param string $value value to add
+     * @return array [offset into archive, length data saved]
      */
     protected function addArchive($archive_filename, $value)
     {
diff --git a/src/library/LinearHashTable.php b/src/library/LinearHashTable.php
index c4d7ff8a2..359e590b5 100644
--- a/src/library/LinearHashTable.php
+++ b/src/library/LinearHashTable.php
@@ -65,6 +65,8 @@ class LinearHashTable
     ];
     /**
      * File extension to use for hash index files for partitions
+     * These are the file buckets which have names based on portions of hash
+     * key values and store records for keys matching the bit pattern
      */
     const HASH_INDEX_EXTENSION = ".hix";
     /**
@@ -72,7 +74,8 @@ class LinearHashTable
      */
     const INDEX_CACHE_SIZE = 100;
     /**
-     *
+     * For each archive partition, the file with the list of keys (not hashes
+     * of keys) that were added for this partition is prefixed with this string.
      */
     const KEY_PREFIX = "key_";
     /**
@@ -95,27 +98,41 @@ class LinearHashTable
      */
     const PARTITION_SIZE_THRESHOLD = 2147483648;
     /**
-     *
+     * In memory copy of the active partition index. key => packed_row pairs
+     * for the active partition.
+     * @var array
      */
     public $active_index;
     /**
-     *
+     * Field variable used as a cache for the file handle, file name, and
+     * time of the last archive file for a partition added to
+     * @var array
      */
     public $add_archive_cache = [null, "", -1];
     /**
-     *
+     * Field variable used as a cache for the file handle, file name, and
+     * time of the last key index file for a partition added to
+     * (the key index file is just a sorted file of the keys that appear
+     * in a hash table partition). Its intended use is to aid in
+     * the creation of additional index structures on top of the linear
+     * hash table
+     * @var array
      */
     public $add_key_archive_cache = [null, "", -1];
     /**
-     *
+     * Field variable used as a cache for the file handle, file name, and
+     * time of the last index hash bucket file added to
+     * @var array
      */
     public $add_index_cache = ["", [], -1];
     /**
-     *
+     * Field variable used as a cache for the file handle, file name, and
+     * time of the last archive file for a partition accessed for a value
+     * @var array
      */
     public $get_archive_cache = [null, "", -1];
     /**
-     * Array of column names for the columns in a BPlusTree which
+     * Array of column names for the columns in a LinearHashTable which
      * are of type BLOB or SERIAL
      * @var array
      */
@@ -132,11 +149,16 @@ class LinearHashTable
      */
     public $folder;
     /**
-     *
+     * Used to cache hash_bucket_path => contents_of_hash_bucket pairs
+     * to speed up retrieval and update.
+     * @var array
      */
     public $index_buffer;
     /**
-     *
+     * Used to keep track of when this instance was created, as part of managing
+     * file handles expiration (could be set/updated externally to reflect
+     * some other instance using the table)
+     * @var int
      */
     public $instance_time;
     /**
@@ -163,7 +185,18 @@ class LinearHashTable
      */
     public $table_tools;
     /**
+     * Creates/Loads LinearHashTable having specified folder and parameters
      *
+     * @param string $folder is the folder for storing the LinearHashTable files
+     * @param array $format the column names, keys and types for this
+     *  LinearHashTable object
+     * @param int $max_items_per_file maximum number of items to store
+     *  in a partition before making the next partition
+     * @param int $partition_size_threshold maximum length of a partition
+     *  file in bytes before a new partition file should be started
+     * @param object $compressor_type
+     *  seekquarry\yioop\library\compressors\Compressor object used to
+     *  compress index files and blob items.
      */
     public function __construct($folder, $format =
         self::DEFAULT_PARAMETERS["FORMAT"],
@@ -235,7 +268,16 @@ class LinearHashTable
         }
     }
     /**
+     * Returns the record associated with the given $key in the
+     * LinearHashTable
      *
+     * @param string $key a key to look up
+     * @param array $fields which fields (columns) from the record to return
+     * @param bool $is_hash_key has the supplied $key already been hashed using
+     *  the LinearHashTable's hash function (the function used to determine
+     *  the partition a key should be in)
+     * @return mixed array containing unpacked record associated with $key if
+     *  it exists in the table, false otherwise
      */
     public function get($key, $fields = [], $is_hash_key = false)
     {
@@ -309,7 +351,14 @@ class LinearHashTable
         return $out_data;
     }
     /**
+     * Returns the blob item from $archive_filename at $offset of length $len,
+     * uncompressing the result
      *
+     * @param string $archive_filename path to an archive node file for this
+     *  LinearHashTable
+     * @param int $offset byte offset into archive node file
+     * @param int $len length of blob item
+     * @return string uncompressed blob item from $archive_filename
      */
     public function getArchive($archive_filename, $offset, $len)
     {
@@ -404,10 +453,17 @@ class LinearHashTable
         return false;
     }
     /**
+     * Used to add a new record or an array of new records to the
+     * LinearHashTable
      *
-     * @param array $row_or_rows
-     * @param bool $is_hash_key
-     * @param bool $allow_duplicates
+     * @param array $row_or_rows either an array for one record with fields
+     *  given by this LinearHashTable's signature or an array of rows.
+     * @param bool $is_hash_key whether or not the key_field in the row or rows
+     *  has already been hashed to a string suitable for storage in this
+     *  LinearHashTable
+     * @param bool $allow_duplicates whether to allow multiple records with
+     *  same key or not.
+     * @return bool success (true) or not (false) for the insert
      */
     public function put($row_or_rows, $is_hash_key = false,
         $allow_duplicates = true)
@@ -422,14 +478,14 @@ class LinearHashTable
                 . " of %s.", $i, $num_rows);
             $key = $row[$this->key_field] ?? false;
             if ($key === false) {
-                crawlLog("LinearHashtable Put Failed A");
+                crawlLog("LinearHashTable Put Failed A");
                 return false;
             }
             unset($row[$this->key_field]);
             $value = $row;
             $hash_key = ($is_hash_key) ? $key : $this->hashKey($key);
             if (!$allow_duplicates && $this->exists($hash_key, false)) {
-                crawlLog("LinearHashtable Put Failed B");
+                crawlLog("LinearHashTable Put Failed B");
                 return false;
             }
             $save_partition = $this->parameters["SAVE_PARTITION"];
@@ -460,7 +516,7 @@ class LinearHashTable
                         $blob = serialize($blob);
                     }
                     if (($add_info = $this->addArchive($blob)) === false) {
-                        crawlLog("LinearHashtable Put Failed C");
+                        crawlLog("LinearHashTable Put Failed C");
                         return false;
                     }
                     list($offset, $len, $partition) = $add_info;
@@ -487,7 +543,7 @@ class LinearHashTable
                 $hash_key, $out_value, PackedTableTools::ADD_FILE_HANDLE)) {
                 $this->parameters['ACTIVE_COUNT']++;
             } else {
-                crawlLog("LinearHashtable Put Failed D");
+                crawlLog("LinearHashTable Put Failed D");
                 return false;
             }
             $i++;
@@ -496,7 +552,19 @@ class LinearHashTable
         return true;
     }
     /**
+     * Saves the active partition to $new_save_partition and advances the
+     * next save partition to be $new_save_partition + 1. If
+     * $new_save_partition == 0, $this->parameters["SAVE_PARTITION"] + 1 is
+     * used for $new_save_partition. If
+     * $new_save_partition <= $this->parameters["SAVE_PARTITION"], then
+     * this function returns false. Saving the active partition means
+     * moving key => record entries from the active index to their appropriate
+     * linear hash table buckets. The Archive file for the save partition is
+     * closed, and a new file handle for the next save partition file is opened.
      *
+     * @param int $new_save_partition index of the save partition when starting
+     *  a new one.
+     * @return bool success (true) or failure (false)
      */
     public function advanceSavePartition($new_save_partition = 0)
     {
@@ -528,7 +596,13 @@ class LinearHashTable
         return true;
     }
     /**
-     *
+     * Deletes a key or set of keys and their associated values from the
+     * linear hash table
+     * @param mixed $key_or_keys either the string of a key or an array of
+     *  strings of keys to delete
+     * @param bool $is_hash_key whether or not the key or keys have already
+     *  been hashed using the linear hash table's hash function
+     * @return bool success (true) or failure (false) for deletion
      */
     public function delete($key_or_keys, $is_hash_key = false)
     {
@@ -587,14 +661,28 @@ class LinearHashTable
         return true;
     }
     /**
-     *
+     * Computes the fixed length hash value of a $key for the purposes of
+     * storing the key associated with a value in this linear hash table.
+     * @param string $key to hash
+     * @return string fixed length result of hashing
      */
     public function hashKey($key)
     {
         return md5($key, true);
     }
     /**
+     * Gets the file path for the bucket associated with $hash_key,
+     * assuming the number of items in the linear hash table is $count,
+     * $max_items_per_file is the maximum number of items per file.
      *
+     * @param string $hash_key LHT hash of a key value
+     * @param int $count number of items to assume in table (defaults to the number
+     *  currently stored)
+     * @param int $max_items_per_file maximum number of items per bucket
+     *  (defaults to this parameter for the current LHT)
+     * @param bool $mkdir_if_not_exists make folders along path that don't
+     *  exist already if true.
+     * @return string path to bucket
      */
     public function getHashPath($hash_key, $count = -1,
         $max_items_per_file = -1, $mkdir_if_not_exists = false)
@@ -643,8 +731,13 @@ class LinearHashTable
         return $old_path;
     }
     /**
-     * @param int
-     * @param int $max_items_per_file
+     * Given an item $count, and a maximum number of items in a bucket for the
+     * LHT, determines the number of buckets, the maximum number of bits
+     * to be used from the hash function to determine bucket, 2^max_bits -1,
+     * and a threshold on when to use this number of bits versus one less.
+     * @param int $count number of items in LHT
+     * @param int $max_items_per_file max items per bucket
+     * @return array 4-tuple [num files, max_num_bit, 2^max_bits -1, threshold]
      */
     public function bitStatistics($count, $max_items_per_file)
     {
@@ -700,11 +793,18 @@ class LinearHashTable
         }
     }
     /**
-     * @param string $key
-     * @param string $value
-     * @param bool $is_hash_key
-     * @param int $change_count
-     * @param bool $bulk_insert
+     * Assuming a $value record where blob items have already been replaced by
+     * their integer offsets into the archive partition, inserts in the LHT
+     * a $value record according to this LHT's signature for $key.
+     *
+     * @param string $key key to use for insert
+     * @param string $value record according to this LHT's signature
+     * @param bool $is_hash_key whether or not $key has already been hashed using
+     *  this LHT's hash function
+     * @param int $change_count how much the size of the LHT should change by
+     * @param bool $bulk_insert whether the current insert is one of many
+     *  (affects how long items are kept in memory before flushing)
+     * @return bool success of insert (true) or failure (false)
      */
     protected function putIndex($key, $value, $is_hash_key = false,
         $change_count = 0, $bulk_insert = false)
@@ -732,7 +832,8 @@ class LinearHashTable
         return false;
     }
     /**
-     *
+     * Handles merging two buckets into one, which might happen after a delete
+     * on the LHT
      */
     protected function mergeMigrate()
     {
@@ -745,7 +846,8 @@ class LinearHashTable
         $this->unlinkHashPath($migrate_to_path_high);
     }
     /**
-     *
+     * Handles splitting one bucket into two, which might happen after an
+     * insert on the LHT
      */
     protected function splitMigrate()
     {
@@ -754,7 +856,8 @@ class LinearHashTable
         $this->unlinkHashPath($migrate_from_path);
     }
     /**
-     * @param string $hash_path
+     * Unlinks the index file at the given $hash_path if it exists
+     * @param string $hash_path path to index file
      */
     protected function unlinkHashPath($hash_path)
     {
@@ -764,8 +867,14 @@ class LinearHashTable
         }
     }
     /**
-     * @param int $count
-     * @param int $max_items_per_file
+     * Given $count items in LHT with bucket size $max_items_per_file, and
+     * assuming a split/merge just occurred, returns the path of the
+     * two new buckets if a split, and the one new merged bucket if a merge
+     *
+     * @param int $count number of items in LHT
+     * @param int $max_items_per_file maximum number of items/bucket
+     * @return array [path to merge bucket, path to split bucket low,
+     *  path to split bucket high]
      */
     protected function computeMigratePaths($count = -1,
         $max_items_per_file = -1)
@@ -791,8 +900,11 @@ class LinearHashTable
             $migrate_to_path_high];
     }
     /**
-     * @param string $hash_path
-     * @param int $new_count
+     * Reads in index records from $hash_path, then reinserts them into LHT
+     * under the assumption that the size of the LHT is $new_count
+     * (defaults to current size + 1)
+     * @param string $hash_path location of bucket to read records from
+     * @param int $new_count size of LHT to assume when reinserting records
      */
     protected function insertRecordsFromIndex($hash_path, $new_count = -1)
     {
@@ -813,7 +925,10 @@ class LinearHashTable
         }
     }
     /**
-     * @param string $value
+     * Add a $value to save partition of this linear hash table
+     * @param string $value value to add
+     * @return array [offset into archive, length data saved,
+     *  index of partition]
      */
     protected function addArchive($value)
     {
@@ -842,7 +957,10 @@ class LinearHashTable
         return [$offset, $len, $save_partition];
     }
     /**
-     * @param string $key
+     * Add a key (not its hash) to the archive file for keys for the
+     * current save partition. This is used to keep track of all the keys
+     * stored in a partition
+     * @param string $key key to add
      */
     protected function addKeyArchive($key)
     {
@@ -870,11 +988,17 @@ class LinearHashTable
         fwrite($fh, $encode, strlen($encode));
     }
     /**
-     * @param string $hash_key
-     * @param string $value
-     * @param int $count
-     * @param bool $mode
-     * @return bool
+     * Adds the $hash_key, $value pair to the linear hash table bucket
+     * (index file) for $hash_key under the assumption that $count items are
+     * in the hash table. If bulk_mode is enabled then the index file
+     * is kept in memory rather than immediately flushed to disk.
+     *
+     * @param string $hash_key key to determine hash table bucket (index file)
+     * @param string $value packed table data to associate with key
+     * @param int $count number of items to assume in linear hash table.
+     *  If -1 then use the saved parameter count
+     * @param bool $bulk_mode whether to keep the index/bucket in memory (true)
+     * @return bool success (true) or failure (false) of addition
      */
     protected function addIndex($hash_key, $value, $count = -1,
         $bulk_mode = false)
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 6ada95f0b..0db06be20 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -140,9 +140,9 @@ class PartitionDocumentBundle
      */
     public $index_cache_size;
     /**
-     * hi-res time the file handle for add or getting filehandle blob was
-     * created. Used to determine if the file_handle needs to be closed, because
-     * active partition changed
+     * Used to keep track of when this instance was created, as part of managing
+     * file handles expiration (could be set/updated externally to reflect
+     * some other instance using the bundle)
      * @var int
      */
     public $instance_time;
@@ -176,8 +176,10 @@ class PartitionDocumentBundle
      *  PartitionDocumentBundle
      * @param array $format the column names, keys and types for this
      *  PartitionDocumentBundle object
-     * @param int $partition_size_threshold maximum number of items to store
+     * @param int $max_items_per_file maximum number of items to store
      *  in a parition before making the next partition
+     * @param int $partition_size_threshold maximum length of a partition
+     *  file in bytes before a new partition file should be started
      * @param object $compressor_type
      *  seekquarry\yioop\library\compressors\Compressor object used to
      *  compress record files and blob items.
ViewGit