diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php index 484a8552c..4638e8cec 100644 --- a/src/library/BPlusTree.php +++ b/src/library/BPlusTree.php @@ -54,7 +54,8 @@ class BPlusTree /** * Default parameters to use when constructing a BPlusTree */ - const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR, + const DEFAULT_PARAMETERS = ["NODE_COMPRESSOR" => self::DEFAULT_COMPRESSOR, + "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR, "FORMAT" => ["PRIMARY KEY" => ["KEY", -1], "VALUE" => "BLOB"], "MAX_KEYS" => self::MAX_KEYS ]; @@ -110,10 +111,16 @@ class BPlusTree public $blob_columns; /** * The seekquarry\yioop\library\compressors\Compressor object used to - * compress record files and blob items. + * compress node files. * @var object */ - public $compressor; + public $node_compressor; + /** + * The seekquarry\yioop\library\compressors\Compressor object used to + * compress blob columns. + * @var object + */ + public $blob_compressor; /** * Folder for storing the B-Tree files * @var string @@ -166,20 +173,26 @@ class BPlusTree * @param array $format the column names, keys and types for this * B+-Tree object * @param int $max_keys the maximum number of keys a node is allowed to hold - * @param object $compressor_type + * @param object $node_compressor_type + * seekquarry\yioop\library\compressors\Compressor object used to + * compress index node files. + * @param object $blob_compressor_type * seekquarry\yioop\library\compressors\Compressor object used to - * compress index node files and blob items. + * compress blob columns. */ public function __construct($folder, $format = self::DEFAULT_PARAMETERS["FORMAT"], $max_keys = self::MAX_KEYS, - $compressor_type = self::DEFAULT_COMPRESSOR) + $node_compressor_type = self::DEFAULT_COMPRESSOR, + $blob_compressor_type = self::DEFAULT_COMPRESSOR) { // ensure $max_keys odd $max_keys = ($max_keys % 2 == 1) ? 
$max_keys : $max_keys + 1; $initial_parameters = self::DEFAULT_PARAMETERS; $initial_parameters["MAX_KEYS"] = $max_keys; - $initial_parameters["COMPRESSOR"] = $compressor_type; - $this->compressor = new $compressor_type(); + $initial_parameters["NODE_COMPRESSOR"] = $node_compressor_type; + $initial_parameters["BLOB_COMPRESSOR"] = $blob_compressor_type; + $this->node_compressor = new $node_compressor_type(); + $this->blob_compressor = new $blob_compressor_type(); $initial_parameters["FORMAT"] = $format; $this->instance_time = hrtime(true); $this->folder = $folder; @@ -232,7 +245,7 @@ class BPlusTree $packed_table_format["LAST_BLOB_LEN"] = "INT"; } $this->table_tools = new PackedTableTools($packed_table_format, - $compressor_type); + $node_compressor_type); if ($changed_parameters) { $this->saveParameters(); } @@ -711,7 +724,7 @@ class BPlusTree { list($fh, $previous_archive_filename, $previous_instance_time) = $this->get_archive_cache; - $compressor = $this->compressor; + $blob_compressor = $this->blob_compressor; if (!$fh || $previous_archive_filename != $archive_filename || $previous_instance_time != $this->instance_time) { if ($fh) { @@ -721,14 +734,14 @@ class BPlusTree $previous_archive_filename = $archive_filename; $previous_instance_time = $this->instance_time; } - if (empty($compressor)) { - $compress_type = $this->parameters["COMPRESSOR"]; - $compressor = new $compress_type(); + if (empty($blob_compressor)) { + $blob_compress_type = $this->parameters["BLOB_COMPRESSOR"]; + $blob_compressor = new $blob_compress_type(); } $value = false; if (fseek($fh, $offset) == 0 && $len > 0) { $compressed_file = fread($fh, $len); - $value = $compressor->uncompress($compressed_file); + $value = $blob_compressor->uncompress($compressed_file); } $this->get_archive_cache = [$fh, $previous_archive_filename, $previous_instance_time]; @@ -813,11 +826,11 @@ class BPlusTree $previous_archive_filename = $archive_filename; $previous_instance_time = $this->instance_time; } - 
$compress_type = $this->parameters["COMPRESSOR"]; - $compressor = new $compress_type(); + $blob_compress_type = $this->parameters["BLOB_COMPRESSOR"]; + $blob_compressor = new $blob_compress_type(); fseek($fh, 0, SEEK_END); $offset = ftell($fh); - $compressed_value = $compressor->compress($value); + $compressed_value = $blob_compressor->compress($value); $len = strlen($compressed_value); $success = (fwrite($fh, $compressed_value, $len) !== false) ? true : false; @@ -862,7 +875,13 @@ class BPlusTree { $parameter_path = $folder . "/" . self::PARAMETERS_FILE; if(file_exists($parameter_path)) { - return unserialize(file_get_contents($parameter_path)); + $parameters = unserialize(file_get_contents($parameter_path)) ?? []; + if (!empty($parameters["COMPRESSOR"])) { + //original format didn't distinguish between compressor use + $parameters["NODE_COMPRESSOR"] ??= $parameters["COMPRESSOR"]; + $parameters["BLOB_COMPRESSOR"] ??= $parameters["COMPRESSOR"]; + } + return $parameters; } else { return []; } diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index 9b23371f0..db01daee9 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -54,7 +54,7 @@ class IndexDocumentBundle implements CrawlConstants * The version of this IndexDocumentBundle. The lowest format number is * 3.0 as prior inverted index/document stores used IndexArchiveBundle's */ - const DEFAULT_VERSION = "3.0"; + const DEFAULT_VERSION = "3.1"; /** * Default values for the configuration parameters of an * IndexDocumentBundle @@ -241,28 +241,32 @@ class IndexDocumentBundle implements CrawlConstants file_put_contents($next_partition_path, $this->next_partition_to_add); } + $record_compressor = C\NS_COMPRESSORS . "NonCompressor"; + if ($this->archive_info['VERSION'] < 3.1) { + $record_compressor = C\NS_COMPRESSORS . "GzipCompressor"; + } + $blob_compressor = C\NS_COMPRESSORS . 
"GzipCompressor"; $this->documents = new PartitionDocumentBundle($dir_name . "/" . self::DOCUMENTS_FOLDER, ["PRIMARY KEY" => [self::DOC_ID, self::DOCID_LEN], self::SUMMARY => "SERIAL", self::PAGE => "SERIAL"], $num_docs_per_partition, PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD, - C\NS_COMPRESSORS . "GzipCompressor"); + $record_compressor, $blob_compressor); if (!$read_only_archive) { $this->documents->index_cache_size = 1; } $this->doc_map_tools = new PackedTableTools([ "PRIMARY KEY" => ["DOC_KEYS", 24], "POS" => "INT", - "SCORE" => "FLOAT"], C\NS_COMPRESSORS . "GzipCompressor"); + "SCORE" => "FLOAT"], $record_compressor); $this->postings_tools = new PackedTableTools([ "PRIMARY KEY" => ["TERM", 16], "DOC_MAP_INDEX" => "INT", "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT", - "POSITIONS_LEN" => "INT"], C\NS_COMPRESSORS . - "GzipCompressor"); + "POSITIONS_LEN" => "INT"], $record_compressor); $this->last_entries_tools = new PackedTableTools([ "PRIMARY KEY" => ["TERM", 16], "LAST_INDEX" => "INT", "LAST_OFFSET" => "INT", "NUM_OCCURRENCES" => "INT"], - C\NS_COMPRESSORS . "GzipCompressor"); + $record_compressor); if (!$read_only_archive) { $this->documents->initCountIfNotExists("VISITED_URLS_COUNT"); } @@ -270,7 +274,7 @@ class IndexDocumentBundle implements CrawlConstants self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16], "PARTITION" => "INT", "NUM_DOCS" => "INT", "NUM_OCCURRENCES" => "INT", "POSTINGS" => "BLOB"], $max_keys, - C\NS_COMPRESSORS . "GzipCompressor"); + $record_compressor, $blob_compressor); } /** * Add the array of $pages to the documents PartitionDocumentBundle diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php index 08eb777f5..f13cadd13 100644 --- a/src/library/PackedTableTools.php +++ b/src/library/PackedTableTools.php @@ -424,8 +424,7 @@ class PackedTableTools if (!file_exists($table_path)) { return ($mode == PackedTableTools::AS_STRING_MODE) ? 
"" : null; } - $pre_table = file_get_contents($table_path); - $table = $this->compressor->uncompress($pre_table); + $table = $this->compressor->uncompressGetFile($table_path); if ($cache_table) { $this->table_cache[$hash_name] = $table; } @@ -605,8 +604,7 @@ class PackedTableTools } $out = substr($out, 1); } - $out = $this->compressor->compress($out); - return (file_put_contents($table_path, $out) > 0); + return ($this->compressor->compressPutFile($table_path, $out) > 0); } /** * Given a table_row, which might represent several items grouped because diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php index 6781f17c4..69a711577 100644 --- a/src/library/PartitionDocumentBundle.php +++ b/src/library/PartitionDocumentBundle.php @@ -63,7 +63,8 @@ class PartitionDocumentBundle /** * Default parameters to use when constructing a PartitionDocumentBundle */ - const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR, + const DEFAULT_PARAMETERS = ["RECORD_COMPRESSOR" => self::DEFAULT_COMPRESSOR, + "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR, "COUNT" => 0, "PARTITION_SIZE_THRESHOLD" => self::PARTITION_SIZE_THRESHOLD, "FORMAT" => ["PRIMARY KEY" => "KEY", "VALUE" => "BLOB"], @@ -116,10 +117,16 @@ class PartitionDocumentBundle public $blob_columns; /** * The seekquarry\yioop\library\compressors\Compressor object used to - * compress record files and blob items. + * compress record files. * @var object */ - public $compressor; + public $record_compressor; + /** + * The seekquarry\yioop\library\compressors\Compressor object used to + * compress blob columns. 
+ * @var object + */ + public $blob_compressor; /** * Folder path where the PartitionDocumentBundle is stored * @var string @@ -175,22 +182,28 @@ class PartitionDocumentBundle * in a partition before making the next partition * @param int $partition_size_threshold maximum length of a partition * file in bytes before a new partition file should be started - * @param object $compressor_type + * @param object $record_compressor_type + * seekquarry\yioop\library\compressors\Compressor object used to + * compress record files excluding blob columns. + * @param object $blob_compressor_type * seekquarry\yioop\library\compressors\Compressor object used to - * compress record files and blob items. + * compress blob columns. */ public function __construct($folder, $format = self::DEFAULT_PARAMETERS["FORMAT"], $max_items_per_file = self::MAX_ITEMS_PER_FILE, $partition_size_threshold = self::PARTITION_SIZE_THRESHOLD, - $compressor_type = self::DEFAULT_COMPRESSOR) + $record_compressor_type = self::DEFAULT_COMPRESSOR, + $blob_compressor_type = self::DEFAULT_COMPRESSOR) { $initial_parameters = self::DEFAULT_PARAMETERS; $initial_parameters["PARTITION_SIZE_THRESHOLD"] = $partition_size_threshold; $initial_parameters["MAX_ITEMS_PER_FILE"] = $max_items_per_file; - $initial_parameters["COMPRESSOR"] = $compressor_type; - $this->compressor = new $compressor_type(); + $initial_parameters["RECORD_COMPRESSOR"] = $record_compressor_type; + $initial_parameters["BLOB_COMPRESSOR"] = $blob_compressor_type; + $this->record_compressor = new $record_compressor_type(); + $this->blob_compressor = new $blob_compressor_type(); $initial_parameters["FORMAT"] = $format; $this->instance_time = hrtime(true); $this->index_cache_size = min(50, floor(metricToInt( @@ -246,7 +259,7 @@ class PartitionDocumentBundle $packed_table_format["LAST_BLOB_LEN"] = "INT"; } $this->table_tools = new PackedTableTools($packed_table_format, - $compressor_type); + $record_compressor_type); if ($changed_parameters) { 
$this->saveParameters(); } @@ -345,7 +358,7 @@ class PartitionDocumentBundle { list($fh, $previous_archive_filename, $previous_instance_time) = $this->get_archive_cache; - $compressor = $this->compressor; + $blob_compressor = $this->blob_compressor; if (!$fh || $previous_archive_filename != $archive_filename || $previous_instance_time != $this->instance_time) { if ($fh) { @@ -358,7 +371,7 @@ class PartitionDocumentBundle $value = false; if (fseek($fh, $offset) == 0) { $compressed_file = fread($fh, $len); - $value = $compressor->uncompress($compressed_file); + $value = $blob_compressor->uncompress($compressed_file); } $this->get_archive_cache = [$fh, $previous_archive_filename, $previous_instance_time]; @@ -374,7 +387,7 @@ class PartitionDocumentBundle public function getPartition($i) { return $this->folder . "/" . self::PARTITION_PREFIX . - $i . $this->compressor->fileExtension(); + $i . $this->blob_compressor->fileExtension(); } /** * Returns the path to the index file (used to store all columns @@ -609,7 +622,13 @@ class PartitionDocumentBundle { $parameter_path = $folder . "/" . self::PARAMETERS_FILE; if(file_exists($parameter_path)) { - return unserialize(file_get_contents($parameter_path)) ?? []; + $parameters = unserialize(file_get_contents($parameter_path)) ?? 
[]; + if (!empty($parameters["COMPRESSOR"])) { + //original format didn't distinguish between compressor use + $parameters["RECORD_COMPRESSOR"] ??= $parameters["COMPRESSOR"]; + $parameters["BLOB_COMPRESSOR"] ??= $parameters["COMPRESSOR"]; + } + return $parameters; } else { return []; } @@ -625,7 +644,7 @@ class PartitionDocumentBundle { list($fh, $previous_partition_filename, $previous_instance_time) = $this->add_archive_cache; - $compressor = $this->compressor; + $blob_compressor = $this->blob_compressor; $save_partition = $this->parameters["SAVE_PARTITION"]; $partition_filename = $this->getPartition($save_partition); if (!is_resource($fh) || @@ -640,7 +659,7 @@ class PartitionDocumentBundle } fseek($fh, 0, SEEK_END); $offset = ftell($fh); - $compressed_value = $compressor->compress($value); + $compressed_value = $blob_compressor->compress($value); $len = strlen($compressed_value); fwrite($fh, $compressed_value, $len); $this->add_archive_cache = [$fh, $previous_partition_filename, diff --git a/src/library/compressors/Compressor.php b/src/library/compressors/Compressor.php index e259cf482..ac43bc512 100755 --- a/src/library/compressors/Compressor.php +++ b/src/library/compressors/Compressor.php @@ -42,21 +42,37 @@ require_once __DIR__."/../../configs/Config.php"; interface Compressor { /** - * Applies the Compressor compress filter to a string before it is - * inserted into a WebArchive. + * Applies the Compressor compress filter to a string and returns the + * resulting string. * * @param string $str string to apply filter to * @return string the result of applying the filter */ public function compress($str); /** - * Used to unapply the compress filter as when data is read out of a - * WebArchive. 
+ * Applies the Compressor compress filter to a string $str and + * then writes it to file $file_name + * + * @param string $file_name to write string to + * @param string $str string to apply filter to + * @return int the number of bytes written + */ + public function compressPutFile($file_name, $str); + /** + * Used to unapply the compress filter to data in a string $str * * @param string $str data read from a string archive * @return string result of uncompressing */ public function uncompress($str); + /** + * Applies the Compressor uncompress filter to the contents read + * from $file_name and returns the result as a string + * + * @param string $file_name name of the file to read from + * @return string the uncompressed contents of the $file_name + */ + public function uncompressGetFile($file_name); /** * Used to compress an int as a fixed length string in the format of * the compression algorithm underlying the compressor. diff --git a/src/library/compressors/GzipCompressor.php b/src/library/compressors/GzipCompressor.php index af2e77f21..e2f6622a8 100755 --- a/src/library/compressors/GzipCompressor.php +++ b/src/library/compressors/GzipCompressor.php @@ -46,8 +46,8 @@ class GzipCompressor implements Compressor */ public function __construct() {} /** - * Applies the Compressor compress filter to a string before it is inserted - * into a WebArchive. In this case, applying the filter means gzipping. + * Applies the Compressor compress filter to a string and returns the + * resulting string. In this case, applying the filter means gzipping. 
* * @param string $str string to apply filter to * @return string the result of applying the filter @@ -56,6 +56,19 @@ class GzipCompressor implements Compressor { return gzencode($str, 9); } + /** + * Applies the Compressor gzip filter to a string $str and + * then writes it to file $file_name + * + * @param string $file_name to write string to + * @param string $str string to apply filter to + * @return int the number of bytes written + */ + public function compressPutFile($file_name, $str) + { + $gz_str = gzencode($str, 9); + return file_put_contents($file_name, $gz_str); + } /** * Used to unapply the compress filter as when data is read out of a * WebArchive. In this case, unapplying the filter means gunzipping. @@ -67,7 +80,21 @@ class GzipCompressor implements Compressor { $skip_header_string = substr($str, 10);// 10 bytes to skip gzip header return (!empty($skip_header_string)) ? - gzinflate($skip_header_string) : ""; + gzinflate($skip_header_string) : ""; + } + /** + * Applies the Compressor uncompress filter to the contents read + * from $file_name and returns the result as a string + * + * @param string $file_name name of the gzipped file to read from + * @return string the uncompressed contents of the $file_name + */ + public function uncompressGetFile($file_name) + { + $str = file_get_contents($file_name); + $skip_header_string = substr($str, 10);// 10 bytes to skip gzip header + return (!empty($skip_header_string)) ? 
+ gzinflate($skip_header_string) : ""; } /** * Used to compress an int as a fixed length string in the format of diff --git a/src/library/compressors/NonCompressor.php b/src/library/compressors/NonCompressor.php index acf27ee57..fc9273616 100755 --- a/src/library/compressors/NonCompressor.php +++ b/src/library/compressors/NonCompressor.php @@ -56,6 +56,18 @@ class NonCompressor implements Compressor { return $str; } + /** + * Writes the string $str unchanged (no compression is applied) + * to the file $file_name + * + * @param string $file_name to write string to + * @param string $str string to write as is + * @return int the number of bytes written + */ + public function compressPutFile($file_name, $str) + { + return file_put_contents($file_name, $str); + } /** * Used to unapply the compress filter as when data is read out of a * WebArchive. In this case, the unapplying filter does nothing. @@ -67,6 +79,17 @@ class NonCompressor implements Compressor { return $str; } + /** + * Reads the contents of $file_name and returns them unchanged + * as a string (nothing needs to be uncompressed) + * + * @param string $file_name name of the file to read from + * @return string the contents of the $file_name + */ + public function uncompressGetFile($file_name) + { + return file_get_contents($file_name); + } /** * Used to compress an int as a fixed length string in the format of * the compression algorithm underlying the compressor. 
Since this diff --git a/src/locale/en_US/resources/all_aux_grams.txt b/src/locale/en_US/resources/all_aux_grams.txt index 2593797db..11f3c297f 100755 --- a/src/locale/en_US/resources/all_aux_grams.txt +++ b/src/locale/en_US/resources/all_aux_grams.txt @@ -2,6 +2,7 @@ governor general governor generals lieutenant governor lieutenant governors +prime minister prime ministers executive power executive powers diff --git a/src/locale/en_US/resources/all_word_grams.ftr b/src/locale/en_US/resources/all_word_grams.ftr index c9abdf780..8f6015439 100644 Binary files a/src/locale/en_US/resources/all_word_grams.ftr and b/src/locale/en_US/resources/all_word_grams.ftr differ diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index 9f3c8804f..4ce56f3ad 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -436,7 +436,7 @@ class ParallelModel extends Model !isset($index_archive->generation_info['ACTIVE'])) { return false; } - if ($index_version < 3 ) { + if ($index_version < 3) { $num_generations = $index_archive->generation_info['ACTIVE']; } $add_info = (strncmp($url_or_key, "info:", 5) == 0) ? "" : @@ -447,7 +447,7 @@ class ParallelModel extends Model if (!isset($info[0][4]) && empty($info['ROWS'][0])) { return false; } - $term_id = ($index_version < 3 ) ? $info[0][4] : $hash_key; + $term_id = ($index_version < 3) ? $info[0][4] : $hash_key; if (!empty($info['ROWS'][0])) { $generation = $info['ROWS'][0]['PARTITION']; } @@ -457,7 +457,7 @@ class ParallelModel extends Model if (!$doc_info) { return false; } - $summary_offset = ($index_version < 3 ) ? + $summary_offset = ($index_version < 3) ? 
$doc_info[self::SUMMARY_OFFSET] : $doc_info[self::KEY]; $generation = $doc_info[self::GENERATION]; } else { diff --git a/src/views/TestsView.php b/src/views/TestsView.php index ea7af7b72..b45fc8bf6 100644 --- a/src/views/TestsView.php +++ b/src/views/TestsView.php @@ -147,9 +147,9 @@ class TestsView extends View } $test_title = $data['TEST_NAME']; $current_test_url = "?activity=runTest&test=$test_title"; + $test_title = "<a href='$current_test_url'>$test_title</a>"; if (!empty($data['METHOD'])) { - $test_title = "<a href='$current_test_url'>$test_title</a>:" . - $data['METHOD']; + $test_title .= ":" . $data['METHOD']; } ?> <h2><?=$test_title ?></h2><?php diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index e53c2ab17..d47c770b7 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -214,7 +214,7 @@ use seekquarry\yioop\library\UnitTest; [ CC::DESCRIPTION => "Take me out to the ball game...", CC::HASH => str_pad("2", 8, "0", STR_PAD_LEFT), - CC::TITLE => "A History of Baseball now and then", + CC::TITLE => "A Dialog on Baseball for people now", CC::URL => "https://www.somewhere2.com/" ], CC::PAGE => "Page 2", diff --git a/tests/IndexManagerTest.php b/tests/IndexManagerTest.php index 98589b4b1..a91886e50 100644 --- a/tests/IndexManagerTest.php +++ b/tests/IndexManagerTest.php @@ -112,7 +112,7 @@ use seekquarry\yioop\library\UnitTest; $version_new = IndexManager::getVersion(self::TEST_DIR . "/". self::NEW_BUNDLE); $this->assertEqual($version_old, 1, "Version 1 index detected"); - $this->assertEqual($version_new, 3, "Version 3 index detected"); + $this->assertEqual($version_new, 3.1, "Version 3.1 index detected"); } /** * Tests if IndexManager can return the dictionary information about a