Get show pages to work again in ArcTool.php, a=chris
Get show pages to work again in ArcTool.php, a=chris
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 6e5b4c0a9..43d80f8c9 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -37,13 +37,12 @@ use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\IndexArchiveBundle;
use seekquarry\yioop\library\IndexDocumentBundle;
-use seekquarry\yioop\library\IndexDictionary;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\IndexShard;
+use seekquarry\yioop\library\PartitionDocumentBundle;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\WebArchiveBundle;
-use seekquarry\yioop\library\WebQueueBundle;
use seekquarry\yioop\library\media_jobs\FeedsUpdateJob;
use seekquarry\yioop\controllers\AdminController;
@@ -264,7 +263,7 @@ class ArcTool implements CrawlConstants
}
}
/**
- * Prints the IndexDictionary records for a word in an IndexArchiveBundle
+ * Prints the dictionary records for a word in an IndexDocumentBundle
*
* @param string $archive_path the path of a directory that holds
* an IndexArchiveBundle
@@ -724,27 +723,20 @@ class ArcTool implements CrawlConstants
$iterator = (object) [];
$iterator->end_of_iterator = false;
$archive_name = C\NS_LIB . $archive_type;
- if ($archive_type == "IndexArchiveBundle") {
+ if ($archive_type == "IndexDocumentBundle" ||
+ $archive_type == "FeedDocumentBundle") {
$info = $archive_name::getArchiveInfo($archive_path);
- $num = min($num, $info["COUNT"] - $start);
- $generation_info = unserialize(
- file_get_contents("$archive_path/generation.txt"));
- $num_generations = $generation_info['ACTIVE'] + 1;
- $archive = new WebArchiveBundle($archive_path . "/summaries");
+ $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start);
+ $num_generations = $info['SAVE_PARTITION'] + 1;
+ $index_archive = new IndexDocumentBundle($archive_path);
+ $archive = $index_archive->documents;
} else if ($archive_type == "DoubleIndexBundle") {
- $info = $archive_name::getArchiveInfo($archive_path);
- $num = min($num, $info["COUNT"] - $start);
$bundle_path = "$archive_path/bundle$bundle_num";
- $generation_info = unserialize(
- file_get_contents("$bundle_path/generation.txt"));
- $num_generations = $generation_info['ACTIVE'] + 1;
- echo $bundle_path . "/summaries";
- $archive = new WebArchiveBundle($bundle_path . "/summaries");
- } else if ($archive_type == "WebArchiveBundle") {
- $info = $archive_name::getArchiveInfo($archive_path);
- $num = min($num, $info["COUNT"] - $start);
- $num_generations = $info["WRITE_PARTITION"] + 1;
- $archive = new WebArchiveBundle($archive_path);
+ $info = IndexDocumentBundle::getArchiveInfo($bundle_path);
+ $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start);
+ $num_generations = $info['SAVE_PARTITION'] + 1;
+ $index_archive = new IndexDocumentBundle($archive_path);
+ $archive = $index_archive->documents;
} else {
$nonyioop = true;
$num_generations = 1;
@@ -755,8 +747,8 @@ class ArcTool implements CrawlConstants
$this->badFormatMessageAndExit($archive_path);
}
}
- if (!$nonyioop) {
- if (isset($this->tmp_results)) unset($this->tmp_results);
+ if (!$nonyioop && isset($this->tmp_results)) {
+ unset($this->tmp_results);
}
$num = max($num, 0);
$total = $start + $num;
@@ -766,29 +758,42 @@ class ArcTool implements CrawlConstants
$seen < $total && $generation < $num_generations) {
if ($nonyioop) {
$partition = (object) [];
- $partition->count = 1;
+ $partition_count = 1;
$iterator->seekPage($start);
- if ($iterator->end_of_iterator) { break; }
+ if ($iterator->end_of_iterator) {
+ break;
+ }
$seen += $start;
} else {
- $partition = $archive->getPartition($generation, false);
- if ($partition->count < $start && $seen < $start) {
+ $partition = $archive->loadPartitionIndex($generation);
+ $partition_count = count($partition);
+ if ($seen + $partition_count < $start) {
$generation++;
- $seen += $partition->count;
+ $seen += $partition_count;
continue;
}
+ $keys = array_keys($partition);
+ unset($partition);
+ $seen_generation = 0;
}
$seen_generation = 0;
- while($seen < $total && $seen_generation < $partition->count) {
+ while($seen < $total && $seen_generation < $partition_count) {
if ($nonyioop) {
$num_to_get = min(self::MAX_BUFFER_DOCS, $total - $seen);
$objects = $iterator->nextPages($num_to_get);
$seen += count($objects);
} else {
$num_to_get = min($total - $seen,
- $partition->count - $seen_generation,
+ $partition_count - $seen_generation,
self::MAX_BUFFER_DOCS);
- $objects = $partition->nextObjects($num_to_get);
+ $objects = [];
+ for ($i = $seen_generation; $i < $num_to_get; $i++) {
+ $object = $archive->get($keys[$i], $generation);
+ $summary = $object[self::SUMMARY];
+ unset($object[self::SUMMARY]);
+ $object = array_merge($object, $summary);
+ $objects[] = $object;
+ }
$seen += $num_to_get;
$seen_generation += $num_to_get;
}
@@ -797,15 +802,9 @@ class ArcTool implements CrawlConstants
$num_to_show = min($seen - $start, $num_to_get);
$cnt = 0;
$first = $num_to_get - $num_to_show;
- foreach ($objects as $pre_object) {
+ foreach ($objects as $object) {
if ($cnt >= $first) {
$out = "";
- if ($nonyioop) {
- $object = $pre_object;
- } else {
- if (!isset($pre_object[1])) continue;
- $object = $pre_object[1];
- }
if (isset($object[self::TIMESTAMP])) {
$object[self::TIMESTAMP] =
date("r", $object[self::TIMESTAMP]);