add a just_stats option for FeedDocumentBundle, a=chris
add a just_stats option for FeedDocumentBundle, a=chris
diff --git a/src/library/FeedDocumentBundle.php b/src/library/FeedDocumentBundle.php
index c8b26bfd8..a672fd70c 100644
--- a/src/library/FeedDocumentBundle.php
+++ b/src/library/FeedDocumentBundle.php
@@ -172,10 +172,14 @@ class FeedDocumentBundle extends IndexDocumentBundle
* inverted index takes too long (whether SCHEDULE_DIR/crawl_status.txt)
* has been recently modified is used in crawling to see if have run out
* of new data and the crawl can stopped.
- * @return bool whether job executed to complete
+ * @param bool $just_stats whether to just compute stats on the inverted
+ * index or to actually save the results
+ * @return mixed whether job executed to completion (true or false) if
+ * !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
+ * and TERM_STATISTICS (the latter having term frequency info)
*/
public function buildInvertedIndexPartition($partition = -1,
- $taking_too_long_touch = null)
+ $taking_too_long_touch = null, $just_stats = false)
{
$age = self::OLD_ITEM_TIME;
$pre_feeds = $this->feeds ?? [];
@@ -284,6 +288,19 @@ class FeedDocumentBundle extends IndexDocumentBundle
$word_list, $media_category, $source_name, $lang,
$item[self::PUBDATE], $source_stop_regex);
}
+ if ($just_stats) {
+ $term_stats = [];
+ foreach ($postings as $term => $postings) {
+ $posting_records = $postings_tools->unpack($postings);
+ $term_stats[$term] = count($posting_records);
+ }
+ $statistics = [
+ "NUM_DOCS" => count($doc_map),
+ "NUM_LINKS" => 0,
+ "TERM_STATISTICS" => $term_stats
+ ];
+ return $statistics;
+ }
unset($term_counts['seen']);
$this->addTermCountsTrendingTable($term_counts);
$doc_map_tools->save($doc_map_filename, $doc_map);
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 45b1011d0..aa11dd305 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -376,8 +376,9 @@ class IndexDocumentBundle implements CrawlConstants
* partition.
* @param int $partition to build index for
* @param string $taking_too_long_touch
- * @param bool whether to just compute stats on the inverted or to actually
- * save the results
+ * @return mixed whether job executed to completion (true or false) if
+ * !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
+ * and TERM_STATISTICS (the latter having term frequency info)
*/
public function buildInvertedIndexPartition($partition = -1,
$taking_too_long_touch = null, $just_stats = false)