diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index 157a03006..96a190e71 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -2287,7 +2287,7 @@ class CrawlComponent extends Component implements CrawlConstants $data["URL_ACTION"] = -1; } if ($data["URL"] != "") { - $data["URL"] = UrlParser::canonicalLink($data["URL"], ""); + $data["URL"] = UrlParser::canonicalLink($data["URL"], "", false); if ($data["URL_ACTION"] == C\SEARCH_FILTER_GROUP_ITEM) { $data["URL"] = UrlParser::getHost($data["URL"]); } diff --git a/src/controllers/components/SystemComponent.php b/src/controllers/components/SystemComponent.php index 26935e562..e4d7e8e29 100755 --- a/src/controllers/components/SystemComponent.php +++ b/src/controllers/components/SystemComponent.php @@ -1375,7 +1375,7 @@ EOD; $comma = ",<br />"; } } - if (!defined('PHP_VERSION_ID') || PHP_VERSION_ID < 70400) { + if (!defined('PHP_VERSION_ID') || PHP_VERSION_ID < 80800) { $missing_required .= $comma . tl("system_component_php_version"); $comma = ", "; } diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 8beeafb39..ca9897095 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -1185,6 +1185,9 @@ EOD; $next_partition = $start_generation; $continue = false; $dictionary_log = C\LOG_DIR . "/0-DictionaryUpdater.log"; + if (file_exists($dictionary_log)) { + file_put_contents($dictionary_log, ""); + } while ($next_partition < $save_partition) { if ($old_next_partition != $next_partition) { $old_next_partition = $next_partition; diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php index 0b99d4d96..19ea997e3 100644 --- a/src/library/BPlusTree.php +++ b/src/library/BPlusTree.php @@ -304,8 +304,8 @@ class BPlusTree $table_tools->load($insert_node_path, $mode) ?? 
[]; } $insert_node = $this->insert_node_cache[$insert_node_path]; - $archive_filename = $this->archiveFilenameFromNodeFilename( - $insert_node_path); + $archive_filename = (empty($this->blob_columns)) ? "" : + $this->archiveFilenameFromNodeFilename($insert_node_path); $this->putNode($row, $insert_node, $archive_filename, $is_encoded_key, $mode); if (count($insert_node) > $this->parameters["MAX_KEYS"]) { @@ -509,8 +509,8 @@ class BPlusTree $node_prefix = self::NODE_PREFIX; $tmp_filename = "$parent_folder/$temp_node_name"; $tmp_archive_filename = "$parent_folder/$archive_prefix$temp_node_name"; - $archive_filename = $this->archiveFilenameFromNodeFilename( - $node_path); + $archive_filename = (empty($this->blob_columns)) ? "" : + $this->archiveFilenameFromNodeFilename($node_path); $num_keys = count($node); $half_num = ceil($num_keys/2); $keys = array_keys($node); @@ -545,7 +545,9 @@ class BPlusTree $this->add_archive_cache = [null, "", -1]; $this->get_archive_cache = [null, "", -1]; rename($tmp_filename, $node_path); - rename($tmp_archive_filename, $archive_filename); + if (!empty($this->blob_columns)) { + rename($tmp_archive_filename, $archive_filename); + } } /** * Returns the record associated with a $key as stored in the BPlusTree. @@ -573,11 +575,11 @@ class BPlusTree { $table_tools = $this->table_tools; $key_node_filename = $this->find($key, $is_encoded_key); - $archive_filename = $this->archiveFilenameFromNodeFilename( - $key_node_filename); if (!$key_node_filename) { return null; } + $archive_filename = (empty($this->blob_columns)) ? "" : + $this->archiveFilenameFromNodeFilename($key_node_filename); if ($use_string_node) { $key_node = $table_tools->load($key_node_filename, $table_tools::AS_STRING_MODE, true); @@ -629,7 +631,8 @@ class BPlusTree return $values; } if (!($values = $table_tools->unpack($values, $offset, $limit))) { - crawlLog("Unpack BPlusTree error!!! Key was:$key .."); + crawlLog("Unpack BPlusTree error!!! (Key,offset,limit) was: ". 
+ "($key, $offset, $limit) .."); $value_message = (is_string($values)) ? toHexString($values) : serialize($values); crawlLog(".. value was:" . $value_message); @@ -639,27 +642,28 @@ class BPlusTree if (!$look_up_blobs) { return $values; } - if (!empty($this->blob_columns)) { - $num_blob_columns = count($this->blob_columns); - for ($k = 0; $k < $num_unpacked; $k++) { - $offset = intval($values[$k][$this->blob_columns[0]]); + $num_blob_columns = count($this->blob_columns); + for ($k = 0; $k < $num_unpacked; $k++) { + if ($num_blob_columns > 0) { + $blob_offset = intval($values[$k][$this->blob_columns[0]]); for ($i = 0; $i < $num_blob_columns; $i++) { $column_name = $this->blob_columns[$i]; $len = ($i + 1 < $num_blob_columns) ? intval($values[$k][$this->blob_columns[$i + 1]]) : $values[$k]["LAST_BLOB_LEN"]; $values[$k][$column_name] = ($len == 0) ? "" : - $this->getArchive($archive_filename, $offset, $len); - $offset += $len; + $this->getArchive($archive_filename, $blob_offset, + $len); + $blob_offset += $len; } unset($values[$k]["LAST_BLOB_LEN"]); foreach ($this->serial_columns as $field_name) { $values[$k][$field_name] = unserialize( $values[$k][$field_name]); } - $values[$k][$this->key_field] = ($is_encoded_key) ? - $key : rawurldecode($encode_key); } + $values[$k][$this->key_field] = ($is_encoded_key) ? + $key : rawurldecode($encode_key); } return $values; } diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index c5d280b8d..27739bd8a 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -54,7 +54,7 @@ class IndexDocumentBundle implements CrawlConstants * The version of this IndexDocumentBundle. 
The lowest format number is * 3.0 as prior inverted index/document stores used IndexArchiveBundle's */ - const DEFAULT_VERSION = "3.1"; + const DEFAULT_VERSION = "3.2"; /** * Default values for the configuration parameters of an * IndexDocumentBundle @@ -131,6 +131,16 @@ class IndexDocumentBundle implements CrawlConstants * term. */ const POSTINGS_FILENAME = "postings"; + /** + * Temporary name for postings from a POSTINGS_FILENAME file while + * they are being compressed. + */ + const TEMP_POSTINGS_FILENAME = "temp_postings"; + /** + * How many bytes of postings to buffer before writing them out in + * addPartitionPostingsDictionary + */ + const POSTINGS_BUFFER_SIZE = 1000000; /** * Name of the folder used to hold position lists and document maps. Within * this folder there is a subfolder for each partition which contains a @@ -313,11 +323,20 @@ class IndexDocumentBundle implements CrawlConstants if (!$read_only_archive) { $this->documents->initCountIfNotExists("VISITED_URLS_COUNT"); } - $this->dictionary = new BPlusTree($this->dir_name . "/" . - self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16], - "PARTITION" => "INT", "NUM_DOCS" => "INT", - "NUM_OCCURRENCES" => "INT", "POSTINGS" => "BLOB"], $max_keys, - $record_compressor, $bplus_blob_compressor); + if ($this->archive_info['VERSION'] < "3.2") { + $this->dictionary = new BPlusTree($this->dir_name . "/" . + self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16], + "PARTITION" => "INT", "NUM_DOCS" => "INT", + "NUM_OCCURRENCES" => "INT", "POSTINGS" => "BLOB"], $max_keys, + $record_compressor, $bplus_blob_compressor); + } else { + $this->dictionary = new BPlusTree($this->dir_name . "/" . 
+ self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16], + "PARTITION" => "INT", "NUM_DOCS" => "INT", + "NUM_OCCURRENCES" => "INT", "POSTINGS_OFFSET" => "INT", + "POSTINGS_LEN" => "INT"], $max_keys, + $record_compressor, $bplus_blob_compressor); + } } /** * Add the array of $pages to the documents PartitionDocumentBundle @@ -428,11 +447,32 @@ class IndexDocumentBundle implements CrawlConstants } crawlLog("Start Adding Partition Posting Info to Dictionary"); $start_time = microtime(true); - $this->postings = $postings_tools->load($postings_filename); + $markers = $postings_tools->getEntryMarkers($postings_filename); + $hash_postings_name = crawlHash($postings_filename); + $postings_string = $postings_tools->table_cache[$hash_postings_name]; + $temp_postings_filename = $base_folder . "/" . + self::TEMP_POSTINGS_FILENAME; + rename($postings_filename, $temp_postings_filename); + unset($postings_tools->table_cache[$hash_postings_name]); + $posting_files_len = strlen($postings_string); + //add a marker for the end of the file as a string + $key_len = $this->postings_tools->key_len; $this->last_entries = $last_entries_tools->load($last_entries_filename); - $num_postings = count($this->postings); - $i = 0; - foreach ($this->postings as $term => $entry) { + $num_postings = count($markers) + 1; + $last_marker = 0; + $out_postings = ""; + $postings_offset = 0; + $fh = fopen($postings_filename, "w"); + for ($i = 0; $i < $num_postings; $i++) { + $cur_marker = $markers[$i] ?? null; + $diff = ($cur_marker === null) ? $cur_marker : + $cur_marker - $last_marker; + $pre_row = substr($postings_string, $last_marker, $diff); + $last_marker = $cur_marker + 1; + $term = substr($pre_row, 0, $key_len); + $row = decode255(substr($pre_row, $key_len)); + $postings_len = strlen($row); + $out_postings .= $row; if(crawlTimeoutLog("..Indexer Still processing partition ". "$partition. 
Have completed $i postings of $num_postings.") && $taking_too_long_touch) { @@ -441,7 +481,7 @@ class IndexDocumentBundle implements CrawlConstants } } $start = 0; - $num_docs_term = vByteDecode($entry, $start); + $num_docs_term = vByteDecode($row, $start); $num_occurrences_term = 0; $last_entry = $last_entries_tools->find($this->last_entries, $term); if (!empty($last_entry)) { @@ -452,18 +492,22 @@ class IndexDocumentBundle implements CrawlConstants $dictionary->put(["TERM" => $term, "PARTITION" => $partition, "NUM_DOCS" => $num_docs_term, "NUM_OCCURRENCES" => $num_occurrences_term, - "POSTINGS" => $entry]); - $i++; + "POSTINGS_OFFSET" => $postings_offset, + "POSTINGS_LEN" => $postings_len]); + $postings_offset += $postings_len; + if (strlen($out_postings) > self::POSTINGS_BUFFER_SIZE) { + fwrite($fh, $out_postings); + $out_postings = ""; + } } $dictionary->flushLastPutNode(); + fwrite($fh, $out_postings); + fclose($fh); + unlink($temp_postings_filename); crawlLog("...Finished Adding Partition Posting Info to " . "Dictionary: " . 
changeInMicrotime($start_time)); if (!C\nsdefined("KEEP_PARTITION_CALCULATIONS") || !C\KEEP_PARTITION_CALCULATIONS) { - crawlLog("Deleting partition posting calculations.."); - if (file_exists($postings_filename)) { - unlink($postings_filename); - } if (file_exists($last_entries_filename)) { unlink($last_entries_filename); } @@ -692,6 +736,7 @@ class IndexDocumentBundle implements CrawlConstants return $statistics; } $doc_map_tools->save($doc_map_filename, $this->doc_map); + ksort($this->postings); $postings_tools->save($postings_filename, $this->postings); $last_entries_tools->save($last_entries_filename, $this->last_entries); file_put_contents($positions_filename, $this->positions); @@ -1264,6 +1309,34 @@ class IndexDocumentBundle implements CrawlConstants $result['TOTAL_OCCURRENCES'] = $occurrence_count; return $result; } + /** + * Get the postings stored in the postings file of a partition from + * $offset to $offset + $len, with the 255 encoding already removed. + * + * @param int $partition partition to retrieve postings from + * @param int $offset byte offset into partition/postings file to look for + * them + * @param int $len length of the posting list to retrieve. + * @return string encoded posting list data -- vbyte encoded number of + * postings, followed by the posting data in PackedTableTools format + */ + public function getPostingsString($partition, $offset, $len) + { + static $file_handles = []; + if (empty($file_handles[$partition])) { + $postings_filename = $this->getPartitionBaseFolder($partition) . + "/" . 
IndexDocumentBundle::POSTINGS_FILENAME; + $fh = fopen($postings_filename , "r"); + $file_handles[$partition] = $fh; + } else { + $fh = $file_handles[$partition]; + } + if ($fh && fseek($fh, $offset) == 0 && $len > 0) { + $out = fread($fh, $len); + return $out; + } + return ""; + } /** * Given the postings as a string for a partition for a term unpacks them * into an array of postings, doing de-delta of doc_map_indices and @@ -1286,6 +1359,7 @@ class IndexDocumentBundle implements CrawlConstants if (empty($postings_string)) { return []; } + $items = []; $sum_frequencies = 0; $doc_map_index = 0; $positions_offset = 0; @@ -1302,6 +1376,10 @@ class IndexDocumentBundle implements CrawlConstants $sum_frequencies += $pre_item["FREQUENCY"]; $current_pos += $unpack_len_map[$int_info]; $items[] = $item; + if ($current_pos >= strlen($postings_string)) { + crawlLog("Posting decode error"); + break; //sanity check break + } } return [$items, $sum_frequencies]; } diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php index 232e3c07b..7c8757dfd 100644 --- a/src/library/PackedTableTools.php +++ b/src/library/PackedTableTools.php @@ -373,6 +373,7 @@ class PackedTableTools if (!empty($this->table_entry_markers[$hash_name])) { return $this->table_entry_markers[$hash_name]; } + $this->table_entry_markers[$hash_name] = []; $table_string = $this->load($table_name, self::AS_STRING_MODE, true); $delim = "\xFF"; $this->table_entry_markers[$hash_name] = []; @@ -381,7 +382,7 @@ class PackedTableTools $this->table_entry_markers[$hash_name] = array_column($matches[0], 1); } - return $matches[0]; + return $this->table_entry_markers[$hash_name]; } /** * Returns the $index'th entry out of a string packed according to the diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php index 8b7a72eeb..598d950c4 100644 --- a/src/library/PartitionDocumentBundle.php +++ b/src/library/PartitionDocumentBundle.php @@ -273,7 +273,7 @@ class 
PartitionDocumentBundle * @param int $partition to look for record in * @param array $fields names of fields in this PartitionDocumentBundle * to return - * @return array unpacked record on success, otherwise false + * @return array|false unpacked record on success, otherwise false */ public function get($key, $partition, $fields = []) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 85b5d1265..da7a54ef6 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -927,16 +927,29 @@ class WordIterator extends IndexBundleIterator return []; } $generation_info = $this->dictionary_info[$generation]; - if (is_array($generation_info['POSTINGS'])) { + if (!empty($generation_info['POSTINGS']) && + is_array($generation_info['POSTINGS'])) { return $generation_info['POSTINGS']; //already loaded } $index = IndexManager::getIndex($this->index_name); - if (empty($generation_info['LAST_BLOB_LEN'])) { - $postings_entry = ""; + if ($this->index_version < "3.2") { + if (empty($generation_info['LAST_BLOB_LEN'])) { + $postings_entry = ""; + } else { + $postings_entry = $index->dictionary->getArchive( + $this->archive_file, $generation_info['POSTINGS'], + $generation_info['LAST_BLOB_LEN']); + unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']); + } } else { - $postings_entry = $index->dictionary->getArchive( - $this->archive_file, $generation_info['POSTINGS'], - $generation_info['LAST_BLOB_LEN']); + if (empty($generation_info['POSTINGS_OFFSET']) || + empty($generation_info['POSTINGS_LEN'])) { + $postings_entry = ""; + } else { + $postings_entry = $index->getPostingsString($generation, + $generation_info['POSTINGS_OFFSET'], + $generation_info['POSTINGS_LEN']); + } } if (empty($postings_entry)) { $postings = []; @@ -944,7 +957,6 @@ class WordIterator extends IndexBundleIterator list($postings,) = $index->unpackPostings($postings_entry); } 
$this->dictionary_info[$generation]['POSTINGS'] = $postings; - unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']); return $postings; } /** @@ -979,8 +991,8 @@ class WordIterator extends IndexBundleIterator $partition_info = $this->dictionary_info[$this->generation_pointer]; $this->current_generation = $partition_info['PARTITION']; $postings = $this->getGenerationPostings($this->generation_pointer); - $this->current_doc_offset = - $postings[$this->current_offset]['DOC_MAP_INDEX']; + $this->current_doc_offset = ($postings) ? + $postings[$this->current_offset]['DOC_MAP_INDEX'] : -1; } return [$this->current_generation, $this->current_doc_offset]; } diff --git a/src/library/processors/ImageProcessor.php b/src/library/processors/ImageProcessor.php index 47220acb6..e3bf43d28 100755 --- a/src/library/processors/ImageProcessor.php +++ b/src/library/processors/ImageProcessor.php @@ -143,7 +143,7 @@ class ImageProcessor extends PageProcessor public static function createThumb($image, $width = C\THUMB_DIM, $height = C\THUMB_DIM) { - if (empty($image)) { + if (empty($image) || ($width == 0 && $height == 0)) { return ""; } $size_x = imagesx($image); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 316f6f264..4d3f381e9 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -354,7 +354,7 @@ class PhraseModel extends ParallelModel L\guessLocaleFromString($disjunct_phrases[0])); $map_cnt = 0; foreach ($query_map as $map_url) { - $map_parts = explode("#", $map_url); + $map_parts = explode("###", $map_url); $map_result[self::URL] = $map_parts[0]; $map_result[self::PINNED] = true; $map_result[self::SCORE] = 0; diff --git a/tests/BPlusTreeTest.php b/tests/BPlusTreeTest.php index d8e00a959..0b5c15e19 100644 --- a/tests/BPlusTreeTest.php +++ b/tests/BPlusTreeTest.php @@ -91,7 +91,34 @@ use seekquarry\yioop\library\UnitTest; * Test putting items in bplustrees of odd sized nodes between 3 and 13 and * then seeing if the items can be 
retrieved */ - public function putGetTestCase() + public function putGetTextTestCase() + { + $format = ["PRIMARY KEY" => ["KEY", -1], "VALUE" => "TEXT"]; + for ($i = 3; $i <= 13; $i += 2) { + $bptree = $this->createTree($i, $format); + for ($j = 0; $j < ($i * 40); $j++) { + for($k = 0; $k < 5; $k++) { + $bptree->put(["KEY" => str_pad("$j",4,"0", STR_PAD_LEFT), + "VALUE" => "row{$j}_{$k}"], + PackedTableTools::APPEND_MODE); + } + } + $bptree->flushLastPutNode(); + for ($j = 0; $j < ($i * 40); $j++) { + $rows = $bptree->get(str_pad("$j",4,"0", STR_PAD_LEFT)); + for($k = 0; $k < 5; $k++) { + $this->assertEqual("row{$j}_{$k}", $rows[$k]["VALUE"], + "{$j}th insert into tree of size $i was retrieved okay". + " {$k}th case"); + } + } + } + } + /** + * Test putting items in bplustrees of odd sized nodes between 3 and 13 and + * then seeing if the items can be retrieved + */ + public function putGetBlobTestCase() { for ($i = 3; $i <= 13; $i += 2) { $bptree = $this->createTree($i); diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index d06fa31ad..515145368 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -315,7 +315,15 @@ use seekquarry\yioop\library\UnitTest; $this->assertEqual($sum + count($active_postings), $num_docs, "Term 'be' occurs in correct number of documents"); for ($i = 0; $i < 2; $i++) { - $postings = $posting_tools->unpack($term_row[$i]['POSTINGS']); + $partition = $term_row[$i]['PARTITION']; + $partition_folder = $this->index_archive->getPartitionBaseFolder( + $partition); + $postings_filename = $partition_folder . "/" . + IndexDocumentBundle::POSTINGS_FILENAME; + $postings_string = file_get_contents($postings_filename, + false, null, $term_row[$i]['POSTINGS_OFFSET'], + $term_row[$i]['POSTINGS_LEN']); + $postings = $posting_tools->unpack($postings_string); $base_folder = $this->index_archive->getPartitionBaseFolder( $term_row[$i]['PARTITION']); $positions_filename = $base_folder . "/" . 
diff --git a/tests/IndexManagerTest.php b/tests/IndexManagerTest.php index a91886e50..daea12d8a 100644 --- a/tests/IndexManagerTest.php +++ b/tests/IndexManagerTest.php @@ -111,8 +111,8 @@ use seekquarry\yioop\library\UnitTest; self::OLD_BUNDLE); $version_new = IndexManager::getVersion(self::TEST_DIR . "/". self::NEW_BUNDLE); - $this->assertEqual($version_old, 1, "Version 1 index detected"); - $this->assertEqual($version_new, 3.1, "Version 3.1 index detected"); + $this->assertEqual($version_old, "1", "Version 1 index detected"); + $this->assertEqual($version_new, "3.2", "Version 3.2 index detected"); } /** * Tests if IndexManager can return the dictionary information about a @@ -124,19 +124,23 @@ use seekquarry\yioop\library\UnitTest; $index_archive = $this->index_archive; $dictionary = $index_archive->dictionary; $keys = []; + $a = ""; for ($i = 0; $i < 10; $i++) { $keys[$i] = $this->docidFromIntKeys($i, $i, $i); $docs[] = [ CC::DOC_ID => $keys[$i], CC::SUMMARY => [ - CC::DESCRIPTION => "to$i be or$i not$i to$i be...", + CC::DESCRIPTION => "$a to$i be or$i not$i to$i be...", CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT), CC::TITLE => "Some$i Shakespeare$i Play$i", CC::URL => "https://www.somewhere$i.com/" ], CC::PAGE => "Page $i", ]; + if ($i > 3) { + $a = "aha be "; + } } $num_docs = count($docs); $index_archive->addPages($docs, $num_docs); @@ -150,10 +154,9 @@ use seekquarry\yioop\library\UnitTest; "Active partition postings has been returned as array."); $index = IndexManager::getIndex(self::TEST_DIR . "/". self::NEW_BUNDLE); - $postings = $index->dictionary->getArchive( - $be_info['ARCHIVE_FILE'], - $be_info['ROWS'][0]['POSTINGS'], - $be_info['ROWS'][0]['LAST_BLOB_LEN']); + $postings = $index->getPostingsString(0, + $be_info['ROWS'][0]['POSTINGS_OFFSET'], + $be_info['ROWS'][0]['POSTINGS_LEN']); $posting_rows = $index->postings_tools->unpack($postings); $this->assertEqual(count($posting_rows), 4, "Able to look up postings for Partition 0 of 'be' Word Info.");