Adds the ability to recompute ix files for partitions to ArcTool.php, a=chris

Chris Pollett [2022-08-13 22:Aug:th]
Adds the ability to recompute ix files for partitions to ArcTool.php, a=chris
Filename
src/executables/ArcTool.php
src/library/IndexDocumentBundle.php
src/library/processors/RobotProcessor.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 8b48ebcda..3699ef265 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -177,6 +177,15 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
             case "partition":
                 $this->outputPartitionInfo($path, $argv[3]);
                 break;
+            case "fix-partition":
+                if (!isset($argv[3])) {
+                    $argv[3] = 0;
+                }
+                if (!isset($argv[4])) {
+                    $argv[4] = -1;
+                }
+                $this->fixPartitionIndexes($path, $argv[3], $argv[4]);
+                break;
             case "rebuild":
                 if (!isset($argv[3])) {
                     $argv[3] = 0;
@@ -430,7 +439,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
      * within the $index'th partition in the bundle
      *
      * @param string $archive_path the path of a directory that holds
-     *     an IndexArchiveBundle
+     *     an IndexDocumentBundle
      * @param int $num of partition to show info for
      */
     public function outputPartitionInfo($archive_path, $num)
@@ -488,6 +497,136 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
             $i ++;
         }
     }
+    /**
+     * Recomputes the hash index (.ix) files for a range of partitions
+     * from start_partition to end_partition in the documents subfolder of
+     * an IndexDocumentBundle. An ix file contains a sequence of compressed
+     * 4-tuples (doc_id, summary_offset, summary_length, cache_length)
+     * corresponding to a partition file (these end in .txt.gz and are
+     * a sequence of compressed document summaries followed by original
+     * documents). Each partition file is read in fixed-size chunks which
+     * are split on gzip member headers; every summary that is immediately
+     * followed by a cache page contributes one row to the rebuilt index.
+     * Progress and problems are reported with echo; the script exits if a
+     * partition file is missing or cannot be opened.
+     *
+     * @param string $archive_path the path of a directory that holds
+     *     an IndexDocumentBundle
+     * @param int $start_partition first partition to recompute
+     * @param int $end_partition last partition to recompute (inclusive),
+     *     -1 means use the bundle's SAVE_PARTITION parameter
+     */
+    public function fixPartitionIndexes($archive_path, $start_partition,
+        $end_partition = -1)
+    {
+        // values coming from $argv are strings, normalize before arithmetic
+        $start_partition = intval($start_partition);
+        $end_partition = intval($end_partition);
+        if (preg_match("/\-\d$/", $archive_path)) {
+            $bundle_num = substr($archive_path, -1);
+            $archive_path = substr($archive_path, 0, -2);
+        }
+        $bundle_name = $this->getArchiveName($archive_path);
+        echo "\nBundle Name: $bundle_name\n";
+        $archive_type = $this->getArchiveKind($archive_path);
+        echo "Bundle Type: $archive_type\n";
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexDocumentBundle",])) {
+            $this->badFormatMessageAndExit($archive_path, "index");
+        }
+        preg_match("/\d+$/", $archive_path, $matches);
+        $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
+        if (isset($bundle_num) && $bundle_num >= 0) {
+            $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataFeed") {
+            $index_timestamp = "feed";
+        }
+        $index = IndexManager::getIndex($index_timestamp);
+        $partition_bundle = $index->documents;
+        if ($end_partition == -1) {
+            $end_partition = $partition_bundle->parameters["SAVE_PARTITION"];
+        }
+        $partition_tools = new PackedTableTools(
+            ["PRIMARY KEY" => ["DOC_ID", IndexDocumentBundle::DOCID_LEN],
+            "SUMMARY_OFFSET" => "INT", "SUMMARY_LENGTH" => "INT",
+            "CACHE_PAGE_LENGTH" => "INT"]);
+        $buffer_size = 1000000;
+        /* Matches the 10 byte header that starts each gzip member. The
+           last three header bytes are arbitrary and may be 0x0A, which "."
+           only matches when the s (dot-all) modifier is set.
+         */
+        $gz_start = '/\x1F\x8B\x08\x00\x00\x00\x00.../s';
+        for ($i = $start_partition; $i <= $end_partition; $i++) {
+            $partition_filename = $partition_bundle->getPartition($i);
+            echo "Reconstructing Index File (.ix) for $partition_filename\n";
+            if (!file_exists($partition_filename)) {
+                echo "\nPartition File: $partition_filename does not exists!";
+                echo "\nStopping!";
+                exit();
+            }
+            $fh = fopen($partition_filename, "r");
+            if ($fh === false) {
+                echo "\nCouldn't open Partition File: $partition_filename";
+                echo "\nStopping!";
+                exit();
+            }
+            $first_time = true;
+            $remainder = "";
+            $offset = 0;
+            // true only while the previous item was a usable summary
+            $last_summary = false;
+            $cnt = 0;
+            $partition_index = "";
+            do {
+                $gztext = fread($fh, $buffer_size);
+                $objects = preg_split($gz_start, $remainder . $gztext);
+                $num_objects = count($objects);
+                // text before the very first header is not an object
+                $start = ($first_time) ? 1 : 0;
+                $first_time = false;
+                for ($j = $start ; $j < $num_objects; $j++) {
+                    $compress = $objects[$j];
+                    if ($j == $num_objects - 1 && !feof($fh)) {
+                        // last piece may be cut off, finish it next read
+                        $remainder = $compress;
+                        break;
+                    }
+                    // 10 accounts for the header removed by preg_split
+                    $len = 10 + strlen($compress);
+                    $site_string = @gzinflate($compress);
+                    if (empty($site_string)) {
+                        echo "Couldn't uncompress item $cnt\n";
+                        $offset += $len;
+                        $last_summary = false;
+                        continue;
+                    }
+                    $is_summary = (substr($site_string, 0, 2) == 'a:');
+                    $is_cache = (substr($site_string, 0, 2) == 's:');
+                    if ($is_summary) {
+                        /* until this summary is fully decoded it must not
+                           be paired with the cache page that follows it,
+                           otherwise a stale doc_id and offset would be
+                           written to the index
+                         */
+                        $last_summary = false;
+                        $site = @unserialize($site_string);
+                        if (empty($site)) {
+                            echo "Couldn't unserialize item $cnt\n";
+                            $offset += $len;
+                            continue;
+                        } else if (is_string($site)) {
+                            echo "Item $cnt is a cache page not a summary.\n";
+                            $offset += $len;
+                            continue;
+                        }
+                        $doc_id = IndexDocumentBundle::computeDocId($site);
+                        /* NOTE(review): computeDocId initializes its result
+                           to false; assuming false means no id could be
+                           computed, such a summary is not indexed
+                         */
+                        if ($doc_id === false) {
+                            echo "Couldn't compute doc id for item $cnt\n";
+                            $offset += $len;
+                            continue;
+                        }
+                        $summary_offset = $offset;
+                        $summary_len = $len;
+                        $last_summary = true;
+                    } else if ($is_cache && $last_summary) {
+                        $last_summary = false;
+                        $cache_len = $len;
+                        if ($cnt % 1000 == 0) {
+                            echo "...extracted and indexed $cnt items from:\n" .
+                                "$partition_filename\n";
+                            echo "Last Summary Offset: $summary_offset," .
+                                " Summary Length: $summary_len, Cache Length: ".
+                                "$cache_len\n";
+                        }
+                        $out_value = $partition_tools->pack([
+                            "SUMMARY_OFFSET" => $summary_offset,
+                            "SUMMARY_LENGTH" => $summary_len,
+                            "CACHE_PAGE_LENGTH" => $cache_len
+                        ]);
+                        $partition_tools->add($partition_index, $doc_id,
+                            $out_value, $partition_tools::ADD_MEM_TABLE_STRING);
+                        $cnt++;
+                    } else {
+                        $last_summary = false;
+                    }
+                    $offset += $len;
+                }
+            } while(!feof($fh));
+            // release the handle before moving on to the next partition
+            fclose($fh);
+            $partition_index_name =
+                $partition_bundle->getPartitionIndex($i);
+            echo "Saving $partition_index_name.\n";
+            $partition_tools->save($partition_index_name, $partition_index);
+        }
+    }
     /**
      * Counts and outputs the number of docs and links in each shard
      * in the archive supplied in $archive_path as well as an overall count
@@ -1310,6 +1449,18 @@ php ArcTool.php dict double_index_name which_bundle word start_record num_record
        doc in a record (as opposed to just their total numer) is printed
      */

+php ArcTool.php fix-partition bundle_name
+php ArcTool.php fix-partition bundle_name start_partition
+php ArcTool.php fix-partition bundle_name start_partition end_partition
+    /*  recomputes the hash index (.ix) files for a range of partitions
+        from start_partition to end_partition in the documents subfolder of
+        an IndexDocumentBundle. An ix file contains a sequence of compressed
+        4-tuple (doc_id, summary_offset, summary_length, cache_length)
+        corresponding to a partition file (these end in .txt.gz and are
+        a sequence of compressed document summaries followed by original
+        documents).
+     */
+
 php ArcTool.php info bundle_name
     // return info about documents stored in archive.

diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 6ec4168bf..99828521f 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -76,6 +76,10 @@ class IndexDocumentBundle implements CrawlConstants
      * Length of DocIds used by this IndexDocumentBundle
      */
     const DOCID_LEN = 24;
+    /**
+     * Length of TermIds used by this IndexDocumentBundle
+     */
+    const TERMID_LEN = 16;
     /**
      * Partition i in an IndexDocumentBundle has a subfolder i
      * within self::POSITIONS_DOC_MAP_FOLDER. Within this subfolder i,
@@ -293,12 +297,13 @@ class IndexDocumentBundle implements CrawlConstants
             $this->documents->index_cache_size = 1;
         }
         $this->doc_map_tools = new PackedTableTools([
-            "PRIMARY KEY" => ["DOC_KEYS", 24], "POS" => "INT",
+            "PRIMARY KEY" => ["DOC_KEYS", self::DOCID_LEN], "POS" => "INT",
             "SCORE" => "FLOAT"], $record_compressor);
         $this->postings_tools = new PackedTableTools([
-            "PRIMARY KEY" => ["TERM", 16], "DOC_MAP_INDEX" => "INT",
-                "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT",
-                "POSITIONS_LEN" => "INT"], $record_compressor);
+            "PRIMARY KEY" => ["TERM", self::TERMID_LEN],
+            "DOC_MAP_INDEX" => "INT", "FREQUENCY" => "INT",
+            "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"],
+            $record_compressor);
         $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
         $len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8];
         for ($i = 0; $i < 4; $i++) {
@@ -936,7 +941,7 @@ class IndexDocumentBundle implements CrawlConstants
      *
      * @param array $site site to compute doc_id for
      */
-    public function computeDocId($site)
+    public static function computeDocId($site)
     {
         $doc_id = false;
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php
index d496c3e60..34105f4f9 100644
--- a/src/library/processors/RobotProcessor.php
+++ b/src/library/processors/RobotProcessor.php
@@ -61,8 +61,8 @@ class RobotProcessor extends PageProcessor
     {
         parent::__construct($plugins, $max_description_len,
             $max_links_to_extract, $summarizer_option);
-        /** Register File Types We Handle*/
-        self::$indexed_file_types[] = "pdf";
+        /** Register File Types We Handle
+         */
         self::$mime_processor["text/robot"] = "RobotProcessor";
     }
     /**
ViewGit