Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
Filename
src/executables/ArcTool.php
src/library/IndexDocumentBundle.php
src/library/LSMTree.php
src/library/PackedTableTools.php
src/library/index_bundle_iterators/GroupIterator.php
src/library/index_bundle_iterators/WordIterator.php
tests/PackedTableToolsTest.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 7e298c674..4181aa58c 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -139,9 +139,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
                 $this->checkFilter($argv[2], $argv[3]);
                 break;
             case "count":
-                if (!isset($argv[3])) {
-                    $argv[3] = false;
-                }
+                $argv[3] ??= false;
                 $this->outputCountBundle($path, $argv[3]);
                 break;
             case "doc-lookup":
@@ -171,51 +169,41 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
                 $this->outputArchiveList();
                 break;
             case "make-filter":
-                if (!isset($argv[4])) {
-                    $argv[4] = -1;
-                }
+                $argv[4] ??= -1;
                 $this->makeFilter($argv[2], $argv[3], $argv[4]);
                 break;
             case "migrate":
-                if (!isset($argv[3])) {
-                    $argv[3] = 1;
-                }
+                $argv[3] ??= 1;
                 $this->migrateIndexArchive($path, $argv[3]);
                 break;
             case "partition":
+                $argv[3] ??= 0;
                 $this->outputPartitionInfo($path, $argv[3]);
                 break;
             case "fix-partition":
-                if (!isset($argv[3])) {
-                    $argv[3] = 0;
-                }
-                if (!isset($argv[4])) {
-                    $argv[4] = -1;
-                }
+                $argv[3] ??= 0;
+                $argv[4] ??= -1;
                 $this->fixPartitionIndexes($path, $argv[3], $argv[4]);
                 break;
             case "rebuild":
-                if (!isset($argv[3])) {
-                    $argv[3] = 0;
-                }
-                if (!isset($argv[4])) {
-                    $argv[4] = 1;
-                }
-                $this->rebuildIndexBundle($path, $argv[3], true, $argv[4]);
+                $argv[3] ??= 0;
+                $argv[4] ??= 1;
+                $argv[5] ??= -1;
+                $this->rebuildIndexBundle($path, $argv[3], true, $argv[4],
+                    $argv[5]);
                 break;
             case "remerge":
-                if (!isset($argv[3])) {
-                    $argv[3] = 0;
-                }
-                $this->rebuildIndexBundle($path, $argv[3], false);
+                $argv[3] ??= 0;
+                $argv[4] ??= 1;
+                $argv[5] ??= -1;
+                $this->rebuildIndexBundle($path, $argv[3], false, $argv[4],
+                    $argv[5]);
                 break;
             case "show":
                 if (!isset($argv[3])) {
                     $this->usageMessageAndExit();
                 }
-                if (!isset($argv[4])) {
-                    $argv[4] = 1;
-                }
+                $argv[4] ??= 1;
                 $this->outputShowPages($path, $argv[3], $argv[4]);
                 break;
             default:
@@ -1394,9 +1382,12 @@ EOD;
      *   partition inverted indexes or to try to use existing ones if present
      * @param int $number_of_processes number of CPU processes to use
      *   when trying to recompute partition inverted indexes
+     * @param mixed $last_generation web archive generation at which to stop
+     *  the rebuild (exclusive); values <= 0 (default -1) go till last partition
      */
     public function rebuildIndexBundle($archive_path, $start_generation = 0,
-        $force_recompute = true, $number_of_processes = 1)
+        $force_recompute = true, $number_of_processes = 1,
+        $last_generation = -1)
     {
         $rebuilding = ($force_recompute) ? "Rebuilding" : "Remerging";
         $rebuild = ($force_recompute) ? "rebuild" : "remerge";
@@ -1477,12 +1468,14 @@ EOD;
             $recent_log_times[$i] = time();
         }
         $rebuild_dones = [];
-        while ($next_partition < $save_partition) {
+        $end_partition = ($last_generation > 0) ? min($save_partition,
+            $last_generation) : $save_partition;
+        while ($next_partition < $end_partition) {
             if ($old_next_partition != $next_partition) {
                 $old_next_partition = $next_partition;
-                $num_forks = min($save_partition - $next_partition,
+                $num_forks = min($end_partition - $next_partition,
                     $number_of_processes);
-                echo "Num forks:$num_forks, ".
+                echo "Num forks: $num_forks, ".
                     "num processes $number_of_processes\n";
                 for ($i = 0; $i < $num_forks; $i++) {
                     $process_partition = $next_partition + $i;
@@ -1547,7 +1540,8 @@ EOD;
               all partition, then the call below is used to remerge them into
               the global dictionary
               */
-            $this->rebuildIndexBundle($archive_path, $start_generation, false);
+            $this->rebuildIndexBundle($archive_path, $start_generation, false,
+                last_generation: $last_generation);
         }
         echo "\nIndex $rebuild complete!\n";
     }
@@ -1633,12 +1627,12 @@ php ArcTool.php doc-lookup bundle_name partition doc_map_index
        returns the document stored in partition at doc_map_index
        (here doc_map_index is the value that would be stored in a posting)
     */
-php ArcTool.php dict bundle_name word [details]
-php ArcTool.php dict double_index_name which_bundle word [details]
-php ArcTool.php dict bundle_name word start_record num_records [details]
-php ArcTool.php dict double_index_name which_bundle word start_record num_records [details]
+php ArcTool.php dict bundle_info word [details]
+php ArcTool.php dict bundle_info word start_record num_records [details]
     /* returns index dictionary records for word stored in index archive bundle
-       or double index bundle. In the later case you should provide which bundle
+       or double index bundle. bundle_info is either the name of the bundle
+       or, for a double index bundle, the bundle name, whitespace, then which
+       sub-bundle. In the latter case you should provide which bundle
        you want dictionary info for. This command also supports start
        and number of record parameters. If the word details is added to the end
        of the command then additional information about each
@@ -1699,30 +1693,27 @@ php ArcTool.php partition double_index_name which_bundle partition_number
        or double index bundle (in which case need to say either 0 or  1 bundle)
      */

-php ArcTool.php rebuild bundle_name
-php ArcTool.php rebuild double_index_name which_bundle
-php ArcTool.php rebuild bundle_name continue
-php ArcTool.php rebuild bundle_name continue number_of_processes
-php ArcTool.php rebuild double_index_name which_bundle continue
-php ArcTool.php rebuild bundle_name partition_num
-php ArcTool.php rebuild bundle_name partition_num number_of_processes
-php ArcTool.php rebuild double_index_name which_bundle partition_num
+php ArcTool.php rebuild bundle_info
+php ArcTool.php rebuild bundle_info partition_num
+php ArcTool.php rebuild bundle_info partition_num number_of_processes
+php ArcTool.php rebuild bundle_info partition_num number_of_processes end_partition
     /*  re-extracts words from summaries files in bundle_name a partition at a
         time, builds an inverted index for that partition and adds to the global
-        dictionary. If this process crashes the keyword continue can be used
-        to continue from where it left off. If a partition number is supplied
-        process continue from that partition number. The default number of
-        processes that are used for index rebuilding is 1, however,
-        rebuilding is faster if you specify a number_of_processes proportional
-        to the number of CPU cores of your machine.
+        dictionary. bundle_info is either the name of the bundle or, for a
+        double index bundle, the bundle name, whitespace, then which sub-bundle.
+        If this process crashes, the keyword "continue" can be used
+        to continue from where it left off. If a partition_num is supplied,
+        processing continues from that partition number. The keyword "continue"
+        can be used to continue from the last processed partition number.
+        The default number of processes that are used for index rebuilding is 1,
+        however, rebuilding is faster if you specify a number_of_processes
+        proportional to the number of CPU cores of your machine.
         */

-php ArcTool.php remerge bundle_name
-php ArcTool.php remerge double_index_name which_bundle
-php ArcTool.php remerge bundle_name continue
-php ArcTool.php remerge double_index_name which_bundle continue
-php ArcTool.php remerge bundle_name partition_num
-php ArcTool.php remerge double_index_name which_bundle partition_num
+php ArcTool.php remerge bundle_info
+php ArcTool.php remerge bundle_info partition_num
+php ArcTool.php remerge bundle_info partition_num number_of_processes
+php ArcTool.php remerge bundle_info partition_num number_of_processes end_partition
     /*  this operates like the previously described rebuild command except
         if the inverted index files for a partition already exist in that
         partition they are not recomputed (if they don't exist, they
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 60259c7cc..bc91360ba 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -329,7 +329,7 @@ class IndexDocumentBundle implements CrawlConstants
             "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"],
             $record_compressor);
         $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
-        $len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8];
+        $len_codes = [0 => 1, 1 => 2, 2 => 4, 3 => 8];
         for ($i = 0; $i < 4; $i++) {
             for ($j = 0; $j < 4; $j++) {
                 for ($k = 0; $k < 4; $k++) {
@@ -789,8 +789,9 @@ class IndexDocumentBundle implements CrawlConstants
         if ($just_stats) {
             $term_stats = [];
             foreach ($this->postings as $term => $postings) {
-                list($posting_records,) = $this->unpackPostings($postings);
-                $term_stats[$term] = count($posting_records);
+                $stat_pos = 0;
+                $num_records = vByteDecode($postings, $stat_pos);
+                $term_stats[$term] = $num_records;
             }
             $statistics = [
                 "NUM_DOCS" => $this->doc_map_counter,
@@ -1159,7 +1160,6 @@ class IndexDocumentBundle implements CrawlConstants
     public function addTermPostingLists($position_offset, $word_lists,
         $meta_ids, $doc_map_index)
     {
-        static $my_counter = 0;
         $postings_tools = $this->postings_tools;
         $last_entries_tools = $this->last_entries_tools;
         foreach ($meta_ids as $meta_id) {
@@ -1167,7 +1167,6 @@ class IndexDocumentBundle implements CrawlConstants
         }
         foreach ($word_lists as $word => $position_list) {
             $term_id = canonicalTerm($word);
-            $meta_prefix = substr($word, 0, 5);
             $occurrences = count($position_list);
             if ($occurrences > 0) {
                 $encoded_position_list = encodePositionList($position_list);
@@ -1190,10 +1189,14 @@ class IndexDocumentBundle implements CrawlConstants
             $diff_doc_map_index = $doc_map_index - $last_index;
             $diff_offset = ($occurrences > 0) ?
                 $offset - $last_offset : 0;
+            //note:pack adds vByteEncode of num rows packed to front
             $entry = $postings_tools->pack([
                 "DOC_MAP_INDEX" => $diff_doc_map_index,
                 "FREQUENCY" => $occurrences, "POSITIONS_OFFSET" => $diff_offset,
                 "POSITIONS_LEN" => $len]);
+            /* multiple entries can be associated with the same term_id.
+               term_id => vbyte_encoded_num_entries entry1 \xFF entry2 ...
+             */
             $postings_tools->add($this->postings, $term_id, $entry,
                 PackedTableTools::ADD_MEM_TABLE, PackedTableTools::APPEND_MODE);
             $add_entry = $last_entries_tools->pack(
@@ -1591,7 +1594,7 @@ class IndexDocumentBundle implements CrawlConstants
         $len_posting_strings = strlen($postings_string);
         for ($i = 0; $i < $num_items; $i++) {
             if (!isset($postings_string[$current_pos])) {
-                 crawlLog("Posting decode error");
+                 crawlLog("Posting decode error - Start beyond posting");
                  crawlLog("..Number to decode items: " . $num_items);
                  crawlLog("..Number decoded: " . $i);
                  crawlLog("..Length posting string: " .
@@ -1603,8 +1606,12 @@ class IndexDocumentBundle implements CrawlConstants
             $current_pos++;
             $len_unpack_info = $unpack_len_map[$int_info];
             if ($current_pos + $len_unpack_info > $len_posting_strings) {
-                crawlLog("Posting decode error");
+                crawlLog("Posting decode error -".
+                    " Decode length longer than string");
+                crawlLog(".. Decode Format Length was: " . $len_unpack_info);
                 crawlLog("..Number to decode items: " . $num_items);
+                crawlLog("..Length needed to decode: " .
+                    ($len_unpack_info * $num_items));
                 crawlLog("..Number decoded: " . $i);
                 crawlLog("..Length posting string: " .
                     strlen($postings_string));
@@ -1733,7 +1740,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public static function setArchiveInfo($dir_name, $update_info)
     {
-        $archive_info_path = $dir_name. "/" . self::ARCHIVE_INFO_FILE;
+        $archive_info_path = $dir_name . "/" . self::ARCHIVE_INFO_FILE;
         if (file_exists($archive_info_path)) {
             $info = self::getArchiveInfo($dir_name);
         }
diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php
index 32c69e747..3ead91e5c 100644
--- a/src/library/LSMTree.php
+++ b/src/library/LSMTree.php
@@ -341,7 +341,8 @@ class LSMTree
         {
             $add_rows = $this->getTier($i, $key);
             if (is_array($add_rows)) {
-                $rows += $add_rows;
+                // use array_merge rather than + or get wrong results here
+                $rows = array_merge($rows, $add_rows);
             }
             if ($limit > 0 && count($rows) > $max_rows) {
                 break;
diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php
index 950d1580c..9244de6f8 100644
--- a/src/library/PackedTableTools.php
+++ b/src/library/PackedTableTools.php
@@ -237,7 +237,7 @@ class PackedTableTools
             case self::ADD_FILE_PATH:
                 $separator = (fsize($table) > 0) ? "\xFF" : "";
                 $out = $separator . encode255($encode_key . $table_row);
-                return (file_put_contents($table, $out , FILE_APPEND) > 0);
+                return (file_put_contents($table, $out, FILE_APPEND) > 0);
             case self::ADD_FILE_HANDLE:
                 $separator = (ftell($table) > 0) ? "\xFF" : "";
                 $out = $separator . encode255($encode_key . $table_row);
diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php
index 83581e3d0..61c400d31 100644
--- a/src/library/index_bundle_iterators/GroupIterator.php
+++ b/src/library/index_bundle_iterators/GroupIterator.php
@@ -208,7 +208,7 @@ class GroupIterator extends IndexBundleIterator
                     $pages = -1;
                 }
             } else if (!empty($new_pages)) {
-                $pages += $new_pages;
+                $pages = array_merge($pages, $new_pages);
                 $count = count($pages);
             }
             if ($count < $this->results_per_block && !$done) {
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index a1dcb9bd0..f13d259ac 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -1007,7 +1007,8 @@ class WordIterator extends IndexBundleIterator
             empty($generation_info['POSTINGS_LEN'])) {
             $postings_entry = "";
         } else {
-            $postings_entry = $index->getPostingsString($generation,
+            $postings_entry = $index->getPostingsString(
+                $generation_info['PARTITION'],
                 $generation_info['POSTINGS_OFFSET'],
                 $generation_info['POSTINGS_LEN']);
         }
diff --git a/tests/PackedTableToolsTest.php b/tests/PackedTableToolsTest.php
index b21dc6e1f..45a8175df 100644
--- a/tests/PackedTableToolsTest.php
+++ b/tests/PackedTableToolsTest.php
@@ -144,11 +144,15 @@ use seekquarry\yioop\library\UnitTest;
             ["PRIMARY KEY" => "ID", "A" => "INT", "B" => "TEXT",
              "C" => "INT"], C\NS_COMPRESSORS . "GzipCompressor");
         $table = [];
-        $hash_key = md5("1", true);
-        $table_factory->add($table, $hash_key,
-            $table_factory->pack(["A" => 5, "B" => "Hello World", "C"=> 256]));
-        $hash_key = md5("2", true);
-        $table_factory->add($table, $hash_key,
+        $hash_key1 = md5("1", true);
+        $table_factory->add($table, $hash_key1,
+            $table_factory->pack(["A" => 5, "B" => "Hello World", "C"=> 256]),
+            mode: L\PackedTableTools::APPEND_MODE);
+        $table_factory->add($table, $hash_key1,
+            $table_factory->pack(["A" => 6, "B" => "Hello World2", "C"=> 257]),
+            mode: L\PackedTableTools::APPEND_MODE);
+        $hash_key2 = md5("2", true);
+        $table_factory->add($table, $hash_key2,
             $table_factory->pack(["A" => 20000, "B" => "laladida",
             "C"=> 5600]));
         $table_factory->save(self::TEST_DIR . "/save.txt", $table);
@@ -159,5 +163,10 @@ use seekquarry\yioop\library\UnitTest;
         $loaded_table2 = $table_factory2->load(self::TEST_DIR . "/save2.txt");
         $this->assertEqual($table, $loaded_table2,
             "Add two rows, save, load compressed table gives same result.");
+        $entry1 = $loaded_table[$hash_key1];
+        $start = 0;
+        $num_entries = L\vByteDecode($entry1, $start);
+        $this->assertEqual($num_entries, 2,
+            "Row one has correct number of sub-entries.");
     }
 }
ViewGit