add a just_stats option for FeedDocumentBundle, a=chris

Chris Pollett [2021-09-10 01:Sep:th]
add a just_stats option for FeedDocumentBundle, a=chris
Filename
src/library/FeedDocumentBundle.php
src/library/IndexDocumentBundle.php
diff --git a/src/library/FeedDocumentBundle.php b/src/library/FeedDocumentBundle.php
index c8b26bfd8..a672fd70c 100644
--- a/src/library/FeedDocumentBundle.php
+++ b/src/library/FeedDocumentBundle.php
@@ -172,10 +172,14 @@ class FeedDocumentBundle extends IndexDocumentBundle
      *  inverted index takes too long (whether SCHEDULE_DIR/crawl_status.txt)
      *  has been recently modified is used in crawling to see if have run out
      *  of new data and the crawl can stopped.
-     * @return bool whether job executed to complete
+     * @param bool $just_stats whether to just compute stats on the inverted
+     *      index or to actually save the results
+     * @return mixed whether job executed to completion (true or false) if
+     *      !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
+     *      and TERM_STATISTICS (the latter having term frequency info)
      */
     public function buildInvertedIndexPartition($partition = -1,
-        $taking_too_long_touch = null)
+        $taking_too_long_touch = null, $just_stats = false)
     {
         $age = self::OLD_ITEM_TIME;
         $pre_feeds = $this->feeds ?? [];
@@ -284,6 +288,19 @@ class FeedDocumentBundle extends IndexDocumentBundle
                 $word_list, $media_category, $source_name, $lang,
                 $item[self::PUBDATE], $source_stop_regex);
         }
+        if ($just_stats) {
+            $term_stats = [];
+            foreach ($postings as $term => $postings) {
+                $posting_records = $postings_tools->unpack($postings);
+                $term_stats[$term] = count($posting_records);
+            }
+            $statistics = [
+                "NUM_DOCS" => count($doc_map),
+                "NUM_LINKS" => 0,
+                "TERM_STATISTICS" => $term_stats
+            ];
+            return $statistics;
+        }
         unset($term_counts['seen']);
         $this->addTermCountsTrendingTable($term_counts);
         $doc_map_tools->save($doc_map_filename, $doc_map);
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 45b1011d0..aa11dd305 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -376,8 +376,9 @@ class IndexDocumentBundle implements CrawlConstants
      * partition.
      * @param int $partition to build index for
      * @param string $taking_too_long_touch
-     * @param bool whether to just compute stats on the inverted or to actually
-     *      save the results
+     * @return mixed whether job executed to completion (true or false) if
+     *      !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
+     *      and TERM_STATISTICS (the latter having term frequency info)
      */
     public function buildInvertedIndexPartition($partition = -1,
         $taking_too_long_touch = null, $just_stats = false)
ViewGit