diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 8b48ebcda..3699ef265 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -177,6 +177,15 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
case "partition":
$this->outputPartitionInfo($path, $argv[3]);
break;
+ case "fix-partition":
+ if (!isset($argv[3])) {
+ $argv[3] = 0;
+ }
+ if (!isset($argv[4])) {
+ $argv[4] = -1;
+ }
+ $this->fixPartitionIndexes($path, $argv[3], $argv[4]);
+ break;
case "rebuild":
if (!isset($argv[3])) {
$argv[3] = 0;
@@ -430,7 +439,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
* within the $index'th partition in the bundle
*
* @param string $archive_path the path of a directory that holds
- * an IndexArchiveBundle
+ * an IndexDocumentBundle
* @param int $num of partition to show info for
*/
public function outputPartitionInfo($archive_path, $num)
@@ -488,6 +497,136 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
$i ++;
}
}
+    /**
+     * Recomputes the hash index (.ix) files for a range of partitions
+     * from start_partition to end_partition in the documents subfolder of
+     * an IndexDocumentBundle. An ix file contains a sequence of compressed
+     * 4-tuple (doc_id, summary_offset, summary_length, cache_length)
+     * corresponding to a partition file (these end in .txt.gz and are
+     * a sequence of compressed document summaries followed by original
+     * documents). A cache page is only indexed if the item right before
+     * it was a summary that could be uncompressed and unserialized.
+     * @param string $archive_path the path of a directory that holds
+     *      an IndexDocumentBundle
+     * @param int $start_partition first partition to recompute
+     * @param int $end_partition last partition to recompute (inclusive),
+     *      -1 means use the bundle's SAVE_PARTITION parameter
+     */
+    public function fixPartitionIndexes($archive_path, $start_partition,
+        $end_partition = -1)
+    {
+        if (preg_match("/\-\d$/", $archive_path)) {
+            $bundle_num = substr($archive_path, -1);
+            $archive_path = substr($archive_path, 0, -2);
+        }
+        $bundle_name = $this->getArchiveName($archive_path);
+        echo "\nBundle Name: $bundle_name\n";
+        $archive_type = $this->getArchiveKind($archive_path);
+        echo "Bundle Type: $archive_type\n";
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexDocumentBundle",])) {
+            $this->badFormatMessageAndExit($archive_path, "index");
+        }
+        preg_match("/\d+$/", $archive_path, $matches);
+        $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
+        if (isset($bundle_num) && $bundle_num >= 0) {
+            $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataFeed") {
+            $index_timestamp = "feed";
+        }
+        $index = IndexManager::getIndex($index_timestamp);
+        $partition_bundle = $index->documents;
+        if ($end_partition == -1) {
+            $end_partition = $partition_bundle->parameters["SAVE_PARTITION"];
+        }
+        $partition_tools = new PackedTableTools(
+            ["PRIMARY KEY" => ["DOC_ID", IndexDocumentBundle::DOCID_LEN],
+            "SUMMARY_OFFSET" => "INT", "SUMMARY_LENGTH" => "INT",
+            "CACHE_PAGE_LENGTH" => "INT"]);
+        $buffer_size = 1000000;
+        // gzip member header; s modifier lets . also match a 0x0A byte
+        $gz_start = '/\x1F\x8B\x08\x00\x00\x00\x00.../s';
+        for ($i = $start_partition; $i <= $end_partition; $i++) {
+            $partition_filename = $partition_bundle->getPartition($i);
+            echo "Reconstructing Index File (.ix) for $partition_filename\n";
+            if (!file_exists($partition_filename)) {
+                echo "\nPartition File: $partition_filename does not exists!";
+                echo "\nStopping!";
+                exit();
+            }
+            $fh = fopen($partition_filename, "r");
+            if ($fh === false) {
+                echo "\nCould not open $partition_filename\nStopping!";
+                exit();
+            }
+            $first_time = true;
+            $remainder = "";
+            $offset = 0;
+            // reset per partition so pairing state never leaks across files
+            $last_summary = false;
+            $cnt = 0;
+            $partition_index = "";
+            do {
+                $gztext = fread($fh, $buffer_size);
+                $objects = preg_split($gz_start, $remainder . $gztext);
+                $num_objects = count($objects);
+                $start = ($first_time) ? 1 : 0;
+                $first_time = false;
+                for ($j = $start ; $j < $num_objects; $j++) {
+                    $compress = $objects[$j];
+                    if ($j == $num_objects - 1 && !feof($fh)) {
+                        // may be cut off by the buffer; finish on next read
+                        $remainder = $compress;
+                        break;
+                    }
+                    // 10 is the length of the header preg_split stripped
+                    $len = 10 + strlen($compress);
+                    $site_string = @gzinflate($compress);
+                    $was_summary = $last_summary;
+                    $last_summary = false;
+                    if (empty($site_string)) {
+                        echo "Couldn't uncompress item $cnt\n";
+                    } else if (substr($site_string, 0, 2) == 'a:') {
+                        $site = @unserialize($site_string);
+                        if (empty($site)) {
+                            echo "Couldn't unserialize item $cnt\n";
+                        } else if (is_string($site)) {
+                            echo "Item $cnt is a cache page not a summary.\n";
+                        } else {
+                            $doc_id = IndexDocumentBundle::computeDocId($site);
+                            $summary_offset = $offset;
+                            $summary_len = $len;
+                            // only a decoded summary may claim the next page
+                            $last_summary = true;
+                        }
+                    } else if ($was_summary &&
+                        substr($site_string, 0, 2) == 's:') {
+                        $cache_len = $len;
+                        if ($cnt % 1000 == 0) {
+                            echo "...extracted and indexed $cnt items from:\n" .
+                                "$partition_filename\n";
+                            echo "Last Summary Offset: $summary_offset," .
+                                " Summary Length: $summary_len, Cache Length: ".
+                                "$cache_len\n";
+                        }
+                        $out_value = $partition_tools->pack([
+                            "SUMMARY_OFFSET" => $summary_offset,
+                            "SUMMARY_LENGTH" => $summary_len,
+                            "CACHE_PAGE_LENGTH" => $cache_len
+                        ]);
+                        $partition_tools->add($partition_index, $doc_id,
+                            $out_value, $partition_tools::ADD_MEM_TABLE_STRING);
+                        $cnt++;
+                    }
+                    $offset += $len;
+                }
+            } while(!feof($fh));
+            fclose($fh);
+            $partition_index_name = $partition_bundle->getPartitionIndex($i);
+            echo "Saving $partition_index_name.\n";
+            $partition_tools->save($partition_index_name, $partition_index);
+        }
+    }
/**
* Counts and outputs the number of docs and links in each shard
* in the archive supplied in $archive_path as well as an overall count
@@ -1310,6 +1449,18 @@ php ArcTool.php dict double_index_name which_bundle word start_record num_record
doc in a record (as opposed to just their total numer) is printed
*/
+php ArcTool.php fix-partition bundle_name
+php ArcTool.php fix-partition bundle_name start_partition
+php ArcTool.php fix-partition bundle_name start_partition end_partition
+ /* recomputes the hash index (.ix) files for a range of partitions
+ from start_partition to end_partition in the documents subfolder of
+ an IndexDocumentBundle. An ix file contains a sequence of compressed
+ 4-tuple (doc_id, summary_offset, summary_length, cache_length)
+ corresponding to a partition file (these end in .txt.gz and are
+ a sequence of compressed document summaries followed by orginal
+ documents).
+ */
+
php ArcTool.php info bundle_name
// return info about documents stored in archive.
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 6ec4168bf..99828521f 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -76,6 +76,10 @@ class IndexDocumentBundle implements CrawlConstants
* Length of DocIds used by this IndexDocumentBundle
*/
const DOCID_LEN = 24;
+ /**
+ * Length of TermIds used by this IndexDocumentBundle
+ */
+ const TERMID_LEN = 16;
/**
* Partition i in an IndexDocumentBundle has a subfolder i
* within self::POSITIONS_DOC_MAP_FOLDER. Within this subfolder i,
@@ -293,12 +297,13 @@ class IndexDocumentBundle implements CrawlConstants
$this->documents->index_cache_size = 1;
}
$this->doc_map_tools = new PackedTableTools([
- "PRIMARY KEY" => ["DOC_KEYS", 24], "POS" => "INT",
+ "PRIMARY KEY" => ["DOC_KEYS", self::DOCID_LEN], "POS" => "INT",
"SCORE" => "FLOAT"], $record_compressor);
$this->postings_tools = new PackedTableTools([
- "PRIMARY KEY" => ["TERM", 16], "DOC_MAP_INDEX" => "INT",
- "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT",
- "POSITIONS_LEN" => "INT"], $record_compressor);
+ "PRIMARY KEY" => ["TERM", self::TERMID_LEN],
+ "DOC_MAP_INDEX" => "INT", "FREQUENCY" => "INT",
+ "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"],
+ $record_compressor);
$unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
$len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8];
for ($i = 0; $i < 4; $i++) {
@@ -936,7 +941,7 @@ class IndexDocumentBundle implements CrawlConstants
*
* @param array $site site to compute doc_id for
*/
- public function computeDocId($site)
+ public static function computeDocId($site)
{
$doc_id = false;
if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php
index d496c3e60..34105f4f9 100644
--- a/src/library/processors/RobotProcessor.php
+++ b/src/library/processors/RobotProcessor.php
@@ -61,8 +61,8 @@ class RobotProcessor extends PageProcessor
{
parent::__construct($plugins, $max_description_len,
$max_links_to_extract, $summarizer_option);
- /** Register File Types We Handle*/
- self::$indexed_file_types[] = "pdf";
+ /** Register File Types We Handle
+ */
self::$mime_processor["text/robot"] = "RobotProcessor";
}
/**