Get show pages to work again in ArcTool.php, a=chris

Chris Pollett [2021-09-10 19:Sep:th]
Get show pages to work again in ArcTool.php, a=chris
Filename
src/executables/ArcTool.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 6e5b4c0a9..43d80f8c9 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -37,13 +37,12 @@ use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\IndexArchiveBundle;
 use seekquarry\yioop\library\IndexDocumentBundle;
-use seekquarry\yioop\library\IndexDictionary;
 use seekquarry\yioop\library\IndexManager;
 use seekquarry\yioop\library\IndexShard;
+use seekquarry\yioop\library\PartitionDocumentBundle;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\library\WebArchiveBundle;
-use seekquarry\yioop\library\WebQueueBundle;
 use seekquarry\yioop\library\media_jobs\FeedsUpdateJob;
 use seekquarry\yioop\controllers\AdminController;

@@ -264,7 +263,7 @@ class ArcTool implements CrawlConstants
         }
     }
     /**
-     * Prints the IndexDictionary records for a word in an IndexArchiveBundle
+     * Prints the dictionary records for a word in an IndexDocumentBundle
      *
      * @param string $archive_path the path of a directory that holds
      *     an IndexArchiveBundle
@@ -724,27 +723,20 @@ class ArcTool implements CrawlConstants
         $iterator =  (object) [];
         $iterator->end_of_iterator = false;
         $archive_name = C\NS_LIB . $archive_type;
-        if ($archive_type == "IndexArchiveBundle") {
+        if ($archive_type == "IndexDocumentBundle" ||
+            $archive_type == "FeedDocumentBundle") {
             $info = $archive_name::getArchiveInfo($archive_path);
-            $num = min($num, $info["COUNT"] - $start);
-            $generation_info = unserialize(
-                file_get_contents("$archive_path/generation.txt"));
-            $num_generations = $generation_info['ACTIVE'] + 1;
-            $archive = new WebArchiveBundle($archive_path . "/summaries");
+            $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start);
+            $num_generations = $info['SAVE_PARTITION'] + 1;
+            $index_archive = new IndexDocumentBundle($archive_path);
+            $archive = $index_archive->documents;
         } else if ($archive_type == "DoubleIndexBundle") {
-            $info = $archive_name::getArchiveInfo($archive_path);
-            $num = min($num, $info["COUNT"] - $start);
             $bundle_path = "$archive_path/bundle$bundle_num";
-            $generation_info = unserialize(
-                file_get_contents("$bundle_path/generation.txt"));
-            $num_generations = $generation_info['ACTIVE'] + 1;
-            echo $bundle_path . "/summaries";
-            $archive = new WebArchiveBundle($bundle_path . "/summaries");
-        } else if ($archive_type == "WebArchiveBundle") {
-            $info = $archive_name::getArchiveInfo($archive_path);
-            $num = min($num, $info["COUNT"] - $start);
-            $num_generations = $info["WRITE_PARTITION"] + 1;
-            $archive = new WebArchiveBundle($archive_path);
+            $info = IndexDocumentBundle::getArchiveInfo($bundle_path);
+            $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start);
+            $num_generations = $info['SAVE_PARTITION'] + 1;
+            $index_archive = new IndexDocumentBundle($archive_path);
+            $archive = $index_archive->documents;
         } else {
             $nonyioop = true;
             $num_generations = 1;
@@ -755,8 +747,8 @@ class ArcTool implements CrawlConstants
                 $this->badFormatMessageAndExit($archive_path);
             }
         }
-        if (!$nonyioop) {
-            if (isset($this->tmp_results)) unset($this->tmp_results);
+        if (!$nonyioop && isset($this->tmp_results)) {
+                unset($this->tmp_results);
         }
         $num = max($num, 0);
         $total = $start + $num;
@@ -766,29 +758,42 @@ class ArcTool implements CrawlConstants
             $seen < $total && $generation < $num_generations) {
             if ($nonyioop) {
                 $partition = (object) [];
-                $partition->count = 1;
+                $partition_count = 1;
                 $iterator->seekPage($start);
-                if ($iterator->end_of_iterator) { break; }
+                if ($iterator->end_of_iterator) {
+                    break;
+                }
                 $seen += $start;
             } else {
-                $partition = $archive->getPartition($generation, false);
-                if ($partition->count < $start && $seen < $start) {
+                $partition = $archive->loadPartitionIndex($generation);
+                $partition_count = count($partition);
+                if ($seen + $partition_count < $start) {
                     $generation++;
-                    $seen += $partition->count;
+                    $seen += $partition_count;
                     continue;
                 }
+                $keys = array_keys($partition);
+                unset($partition);
+                $seen_generation = 0;
             }
             $seen_generation = 0;
-            while($seen < $total && $seen_generation < $partition->count) {
+            while($seen < $total && $seen_generation < $partition_count) {
                 if ($nonyioop) {
                     $num_to_get = min(self::MAX_BUFFER_DOCS, $total - $seen);
                     $objects = $iterator->nextPages($num_to_get);
                     $seen += count($objects);
                 } else {
                     $num_to_get = min($total - $seen,
-                        $partition->count - $seen_generation,
+                        $partition_count - $seen_generation,
                         self::MAX_BUFFER_DOCS);
-                    $objects = $partition->nextObjects($num_to_get);
+                    $objects = [];
+                    for ($i = $seen_generation; $i < $num_to_get; $i++) {
+                        $object = $archive->get($keys[$i], $generation);
+                        $summary = $object[self::SUMMARY];
+                        unset($object[self::SUMMARY]);
+                        $object = array_merge($object, $summary);
+                        $objects[] = $object;
+                    }
                     $seen += $num_to_get;
                     $seen_generation += $num_to_get;
                 }
@@ -797,15 +802,9 @@ class ArcTool implements CrawlConstants
                     $num_to_show = min($seen - $start, $num_to_get);
                     $cnt = 0;
                     $first = $num_to_get - $num_to_show;
-                    foreach ($objects as $pre_object) {
+                    foreach ($objects as $object) {
                         if ($cnt >= $first) {
                             $out = "";
-                            if ($nonyioop) {
-                                $object = $pre_object;
-                            } else {
-                                if (!isset($pre_object[1])) continue;
-                                $object = $pre_object[1];
-                            }
                             if (isset($object[self::TIMESTAMP])) {
                                 $object[self::TIMESTAMP] =
                                     date("r", $object[self::TIMESTAMP]);
ViewGit