diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php
index 0526c17dc..abef3e927 100644
--- a/src/library/BPlusTree.php
+++ b/src/library/BPlusTree.php
@@ -124,8 +124,9 @@ class BPlusTree
*/
public $insert_node_cache = [];
/**
- * Used to keep track if the file_handle to the archive cache has changed
- * since the last get operation
+ * Used to keep track of when this instance was created, as part of managing
+ * file handles expiration (could be set/updated externally to reflect
+ * some other instance using the BPlusTree)
*/
public $instance_time;
/**
@@ -775,6 +776,7 @@ class BPlusTree
* Add a value to an archive file $archive_filename of a B+-tree node
* @param string $archive_filename name of archive file
* @param string $value value to add
+ * @return array [offset into archive, length of data saved]
*/
protected function addArchive($archive_filename, $value)
{
diff --git a/src/library/LinearHashTable.php b/src/library/LinearHashTable.php
index c4d7ff8a2..359e590b5 100644
--- a/src/library/LinearHashTable.php
+++ b/src/library/LinearHashTable.php
@@ -65,6 +65,8 @@ class LinearHashTable
];
/**
* File extension to use for hash index files for partitions
+ * These are the file buckets which have names based on portions of hash
+ * key values and store records for keys matching the bit pattern
*/
const HASH_INDEX_EXTENSION = ".hix";
/**
@@ -72,7 +74,8 @@ class LinearHashTable
*/
const INDEX_CACHE_SIZE = 100;
/**
- *
+ * For each archive partition, the file with the list of keys (not hashes
+ * of keys) that were added for this partition will be prefixed with this string.
*/
const KEY_PREFIX = "key_";
/**
@@ -95,27 +98,41 @@ class LinearHashTable
*/
const PARTITION_SIZE_THRESHOLD = 2147483648;
/**
- *
+ * In memory copy of the active partition index. key => packed_row pairs
+ * for the active partition.
+ * @var array
*/
public $active_index;
/**
- *
+ * Field variable used as a cache for the file handle, file name, and
+ * time of the last archive file for a partition added to
+ * @var array
*/
public $add_archive_cache = [null, "", -1];
/**
- *
+ * Field variable used as a cache for the file handle, file name, and
+ * time of the last key index file for a partition added to
+ * (the key index file is just a sorted file of the keys that appear
+ * in a hash table partition). Its intended use is to aid in
+ * the creation of additional index structures on top of the linear
+ * hash table
+ * @var array
*/
public $add_key_archive_cache = [null, "", -1];
/**
- *
+ * Field variable used as a cache for the file handle, file name, and
+ * time of the last index hash bucket file added to
+ * @var array
*/
public $add_index_cache = ["", [], -1];
/**
- *
+ * Field variable used as a cache for the file handle, file name, and
+ * time of the last archive file for a partition accessed for a value
+ * @var array
*/
public $get_archive_cache = [null, "", -1];
/**
- * Array of column names for the columns in a BPlusTree which
+ * Array of column names for the columns in a LinearHashTable which
* are of type BLOB or SERIAL
* @var array
*/
@@ -132,11 +149,16 @@ class LinearHashTable
*/
public $folder;
/**
- *
+ * Used to cache hash_bucket_path => contents_of_hash_bucket pairs
+ * to speed up retrieval and update.
+ * @var array
*/
public $index_buffer;
/**
- *
+ * Used to keep track of when this instance was created, as part of managing
+ * file handles expiration (could be set/updated externally to reflect
+ * some other instance using the table)
+ * @var int
*/
public $instance_time;
/**
@@ -163,7 +185,18 @@ class LinearHashTable
*/
public $table_tools;
/**
+ * Creates/Loads LinearHashTable having specified folder and parameters
*
+ * @param string $folder is the folder for storing the LinearHashTable files
+ * @param array $format the column names, keys and types for this
+ * LinearHashTable object
+ * @param int $max_items_per_file maximum number of items to store
+ * in a partition before making the next partition
+ * @param int $partition_size_threshold maximum length of a partition
+ * file in bytes before a new partition file should be started
+ * @param object $compressor_type
+ * seekquarry\yioop\library\compressors\Compressor object used to
+ * compress index files and blob items.
*/
public function __construct($folder, $format =
self::DEFAULT_PARAMETERS["FORMAT"],
@@ -235,7 +268,16 @@ class LinearHashTable
}
}
/**
+ * Returns the record associated with the given $key in the
+ * LinearHashTable
*
+ * @param string $key a key to look up
+ * @param array $fields which fields (columns) from the record to return
+ * @param bool $is_hash_key has the supplied $key already been hashed using
+ * the LinearHashTable's hash function (the function used to determine
+ * the partition a key should be in)
+ * @return mixed array containing unpacked record associated with $key if
+ * it exists in the table, false otherwise
*/
public function get($key, $fields = [], $is_hash_key = false)
{
@@ -309,7 +351,14 @@ class LinearHashTable
return $out_data;
}
/**
+ * Returns the blob item from $archive_filename at $offset of length $len,
+ * uncompressing the result
*
+ * @param string $archive_filename path to an archive node file for this
+ * LinearHashTable
+ * @param int $offset byte offset into archive node file
+ * @param int $len length of blob item
+ * @return string uncompressed blob item from $archive_filename
*/
public function getArchive($archive_filename, $offset, $len)
{
@@ -404,10 +453,17 @@ class LinearHashTable
return false;
}
/**
+ * Used to add a new record or an array of new records to the
+ * LinearHashTable
*
- * @param array $row_or_rows
- * @param bool $is_hash_key
- * @param bool $allow_duplicates
+ * @param array $row_or_rows either array of record with fields given
+ * by this LinearHashTable's signature or an array of rows.
+ * @param bool $is_hash_key whether or not the key_field in the row or rows
+ * has already been hashed to a string suitable for storage in this
+ * LinearHashTable
+ * @param bool $allow_duplicates whether to allow multiple records with
+ * same key or not.
+ * @return bool success (true) or not (false) for the insert
*/
public function put($row_or_rows, $is_hash_key = false,
$allow_duplicates = true)
@@ -422,14 +478,14 @@ class LinearHashTable
. " of %s.", $i, $num_rows);
$key = $row[$this->key_field] ?? false;
if ($key === false) {
- crawlLog("LinearHashtable Put Failed A");
+ crawlLog("LinearHashTable Put Failed A");
return false;
}
unset($row[$this->key_field]);
$value = $row;
$hash_key = ($is_hash_key) ? $key : $this->hashKey($key);
if (!$allow_duplicates && $this->exists($hash_key, false)) {
- crawlLog("LinearHashtable Put Failed B");
+ crawlLog("LinearHashTable Put Failed B");
return false;
}
$save_partition = $this->parameters["SAVE_PARTITION"];
@@ -460,7 +516,7 @@ class LinearHashTable
$blob = serialize($blob);
}
if (($add_info = $this->addArchive($blob)) === false) {
- crawlLog("LinearHashtable Put Failed C");
+ crawlLog("LinearHashTable Put Failed C");
return false;
}
list($offset, $len, $partition) = $add_info;
@@ -487,7 +543,7 @@ class LinearHashTable
$hash_key, $out_value, PackedTableTools::ADD_FILE_HANDLE)) {
$this->parameters['ACTIVE_COUNT']++;
} else {
- crawlLog("LinearHashtable Put Failed D");
+ crawlLog("LinearHashTable Put Failed D");
return false;
}
$i++;
@@ -496,7 +552,19 @@ class LinearHashTable
return true;
}
/**
+ * Save the active partition to $new_save_partition and advances the
+ * next save partition to be $new_save_partition + 1. If
+ * $new_save_partition == 0, $this->parameters["SAVE_PARTITION"] + 1 is
+ * used for $new_save_partition. If
+ * $new_save_partition <= $this->parameters["SAVE_PARTITION"], then
+ * this function returns false. Saving the active partition means
+ * moving key => record entries from the active index to their appropriate
+ * linear hash table buckets. The Archive file for the save partition is
+ * closed, and a new file handle for the next save partition file is opened.
*
+ * @param int $new_save_partition index of the save partition when starting
+ * a new one.
+ * @return bool success (true) or failure (false)
*/
public function advanceSavePartition($new_save_partition = 0)
{
@@ -528,7 +596,13 @@ class LinearHashTable
return true;
}
/**
- *
+ * Deletes a key or set of keys and their associated values from the
+ * linear hash table
+ * @param mixed $key_or_keys either the string of a key or an array of
+ * strings of keys to delete
+ * @param bool $is_hash_key whether or not the key or keys have already
+ * been hashed using the linear hash table's hash function
+ * @return bool success (true) or failure (false) for deletion
*/
public function delete($key_or_keys, $is_hash_key = false)
{
@@ -587,14 +661,28 @@ class LinearHashTable
return true;
}
/**
- *
+ * Computes the fixed length hash value of a $key for the purposes of
+ * storing the key associated with a value in this linear hash table.
+ * @param string $key to hash
+ * @return string fixed length result of hashing
*/
public function hashKey($key)
{
return md5($key, true);
}
/**
+ * Gets the file path for the bucket associated with $hash_key,
+ * assuming the number of items in the linear hash table is $count,
+ * $max_items_per_file is the maximum number of items per file.
*
+ * @param string $hash_key LHT hash of a key value
+ * @param int $count number of items to assume in table (defaults to the number
+ * currently stored)
+ * @param int $max_items_per_file maximum number of items per bucket
+ * (defaults to this parameter for the current LHT)
+ * @param bool $mkdir_if_not_exists make folders along path that don't
+ * exist already if true.
+ * @return string path to bucket
*/
public function getHashPath($hash_key, $count = -1,
$max_items_per_file = -1, $mkdir_if_not_exists = false)
@@ -643,8 +731,13 @@ class LinearHashTable
return $old_path;
}
/**
- * @param int
- * @param int $max_items_per_file
+ * Given an item $count, and a maximum number of items in a bucket for the
+ * LHT, determines the number of buckets, the maximum number of bits
+ * to be used from the hash function to determine bucket, 2^max_bits -1,
+ * and a threshold on when to use this number of bits versus one less.
+ * @param int $count number of items in LHT
+ * @param int $max_items_per_file max items per bucket
+ * @return array 4-tuple [num files, max_num_bits, 2^max_bits - 1, threshold]
*/
public function bitStatistics($count, $max_items_per_file)
{
@@ -700,11 +793,18 @@ class LinearHashTable
}
}
/**
- * @param string $key
- * @param string $value
- * @param bool $is_hash_key
- * @param int $change_count
- * @param bool $bulk_insert
+ * Assuming a $value record where blob items have already been replaced by
+ * their integer offsets into the archive partition, inserts in the LHT
+ * a $value record according to this LHT's signature for $key.
+ *
+ * @param string $key key to use for insert
+ * @param string $value record according to this LHT's signature
+ * @param bool $is_hash_key whether or not $key has already been hashed using
+ * this LHT's hash function
+ * @param int $change_count how much the size of the LHT should change by
+ * @param bool $bulk_insert whether the current insert is one of many
+ * (affects how long items are kept in memory before flushing)
+ * @return bool success of insert (true) or failure (false)
*/
protected function putIndex($key, $value, $is_hash_key = false,
$change_count = 0, $bulk_insert = false)
@@ -732,7 +832,8 @@ class LinearHashTable
return false;
}
/**
- *
+ * Handles merging two buckets into one, which might happen after a delete
+ * on the LHT
*/
protected function mergeMigrate()
{
@@ -745,7 +846,8 @@ class LinearHashTable
$this->unlinkHashPath($migrate_to_path_high);
}
/**
- *
+ * Handles splitting one bucket into two, which might happen after an
+ * insert on the LHT
*/
protected function splitMigrate()
{
@@ -754,7 +856,8 @@ class LinearHashTable
$this->unlinkHashPath($migrate_from_path);
}
/**
- * @param string $hash_path
+ * Unlinks the index file at the given $hash_path if it exists
+ * @param string $hash_path path to index file
*/
protected function unlinkHashPath($hash_path)
{
@@ -764,8 +867,14 @@ class LinearHashTable
}
}
/**
- * @param int $count
- * @param int $max_items_per_file
+ * Given $count items in LHT with bucket size $max_items_per_file, and
+ * assuming a split/merge just occurred, returns the path of the
+ * two new buckets if a split, and the one new merged bucket if a merge
+ *
+ * @param int $count number of items in LHT
+ * @param int $max_items_per_file maximum number of items/bucket
+ * @return array [path to merge bucket, path to split bucket low,
+ * path to split bucket high]
*/
protected function computeMigratePaths($count = -1,
$max_items_per_file = -1)
@@ -791,8 +900,11 @@ class LinearHashTable
$migrate_to_path_high];
}
/**
- * @param string $hash_path
- * @param int $new_count
+ * Reads in index records from $hash_path, then reinserts them into LHT
+ * under the assumption that the size of the LHT is $new_count
+ * (defaults to current size + 1)
+ * @param string $hash_path location of bucket to read records from
+ * @param int $new_count size of LHT to assume when reinserting records
*/
protected function insertRecordsFromIndex($hash_path, $new_count = -1)
{
@@ -813,7 +925,10 @@ class LinearHashTable
}
}
/**
- * @param string $value
+ * Add a $value to save partition of this linear hash table
+ * @param string $value value to add
+ * @return array [offset into archive, length of data saved,
+ * index of partition]
*/
protected function addArchive($value)
{
@@ -842,7 +957,10 @@ class LinearHashTable
return [$offset, $len, $save_partition];
}
/**
- * @param string $key
+ * Add a key (not its hash) to the archive file for keys for the
+ * current save partition. This is used to keep track of all the keys
+ * stored in a partition
+ * @param string $key key to add
*/
protected function addKeyArchive($key)
{
@@ -870,11 +988,17 @@ class LinearHashTable
fwrite($fh, $encode, strlen($encode));
}
/**
- * @param string $hash_key
- * @param string $value
- * @param int $count
- * @param bool $mode
- * @return bool
+ * Adds the $hash_key, $value pair to the linear hash table bucket
+ * (index file) for $hash_key under the assumption that $count items are
+ * in the hash table. If bulk_mode is enabled then the index file
+ * is kept in memory rather than immediately flushed to disk.
+ *
+ * @param string $hash_key key to determine hash table bucket (index file)
+ * @param string $value packed table data to associate with key
+ * @param int $count number of items to assume in linear hash table.
+ * If -1 then use based on the saved parameter count
+ * @param bool $bulk_mode if true, delay flushing the index/bucket to disk
+ * @return bool success (true) or failure (false) of addition
*/
protected function addIndex($hash_key, $value, $count = -1,
$bulk_mode = false)
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 6ada95f0b..0db06be20 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -140,9 +140,9 @@ class PartitionDocumentBundle
*/
public $index_cache_size;
/**
- * hi-res time the file handle for add or getting filehandle blob was
- * created. Used to determine if the file_handle needs to be closed, because
- * active partition changed
+ * Used to keep track of when this instance was created, as part of managing
+ * file handles expiration (could be set/updated externally to reflect
+ * some other instance using the bundle)
* @var int
*/
public $instance_time;
@@ -176,8 +176,10 @@ class PartitionDocumentBundle
* PartitionDocumentBundle
* @param array $format the column names, keys and types for this
* PartitionDocumentBundle object
- * @param int $partition_size_threshold maximum number of items to store
+ * @param int $max_items_per_file maximum number of items to store
* in a parition before making the next partition
+ * @param int $partition_size_threshold maximum length of a partition
+ * file in bytes before a new partition file should be started
* @param object $compressor_type
* seekquarry\yioop\library\compressors\Compressor object used to
* compress record files and blob items.