Apply NUM_DISTINCT_GENERATIONS only to old indexes

Chris Pollett [2024-01-03 05:Jan:rd]
Apply NUM_DISTINCT_GENERATIONS only to old indexes
Filename
src/configs/Config.php
src/library/IndexDocumentBundle.php
src/library/IndexManager.php
src/library/index_bundle_iterators/WordIterator.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 537b25f1b..bf1350042 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -951,11 +951,12 @@ nsconddefine('MAX_URL_LEN', 2048);
 nsdefine('PAGE_RANGE_REQUEST', 1000000);
 /**
  * When getting information from an index dictionary in word iterator
- * how many distinct generations to read in in one go
+ * for a version < 3 index, how many distinct generations to read in one go
  */
 nsconddefine('NUM_DISTINCT_GENERATIONS', 20);
 /**
- * Used in computing the DOC_RANK when a going through index in descending
+ * Used in computing the DOC_RANK for version < 3 indexes when going
+ * through an index in descending
  * fashion.  It represents an upper bound on the maximum number of
  * generations an IndexArchiveBundle should have
  */
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index a40ece3e9..d77492ccf 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -1456,10 +1456,8 @@ class IndexDocumentBundle implements CrawlConstants
      *     dictionary
      * @param int $threshold after the number of results exceeds this amount
      *     stop looking for more dictionary entries.
-     * @param int $start_generation what generation in the index to start
-     *      finding occurrence of phrase from
-     * @param int $num_distinct_generations from $start_generation how
-     *      many generation to search forward to
+     * @param int $offset
+     * @param int $num_partitions
      * @param bool $with_remaining_total whether to total number of
      *      postings found as well or not
      * @return array either [total, sequence of four tuples]
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index b6e80ef94..24a2d90cf 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -326,59 +326,6 @@ class IndexManager implements CrawlConstants
         }
         return ($with_remaining_total) ? [$total, $info] : $info;
     }
-    /**
-     * Returns the number of document that a given term or phrase appears in
-     * in the given index where we discount later generation -- those with
-     * lower document rank more
-     *
-     * @param string $term what to look up in the indexes dictionary
-     *     no  mask is used for this look up
-     * @param string $index_name index to look up term or phrase in
-     * @param boolean $discount_terms whether terms should be discounted
-     *      based on their generation or not
-     * @return int number of documents
-     */
-    public static function discountedNumDocsTerm($term, $index_name,
-        $discount_terms = true)
-    {
-        static $num_docs_cache = [];
-        if (isset($num_docs_cache[$index_name][$term])) {
-            return $num_docs_cache[$index_name][$term];
-        }
-        $version = self::getVersion($index_name);
-        $term_id = $discount_terms ? (($version > 2) ? canonicalTerm($term) :
-            crawlHashWord($term, true)) : $term;
-        $word_info = self::getWordInfo($index_name, $term_id, -1, 0,
-            C\NUM_DISTINCT_GENERATIONS);
-        if ($version >= 3 && !empty($word_info)) {
-            $word_info = $word_info['ROWS'];
-        }
-        if (empty($word_info)) {
-            return 0.0;
-        }
-        $total = 0.0;
-        $i = 1;
-        foreach ($word_info as $generation_info) {
-            if ($version < 3) {
-                list($generation, , , $num_docs) = $generation_info;
-            } else {
-                $generation = $generation_info['PARTITION'];
-                $num_docs = $generation_info['NUM_DOCS'];
-            }
-            $discount = $discount_terms ? max($generation + 1, $i++) : 1;
-            $total += $num_docs / $discount;
-        }
-        if (count($num_docs_cache) > 1000) {
-            $num_docs_cache = [];
-        }
-        if (!empty($num_docs_cache[$index_name]) &&
-            count($num_docs_cache[$index_name]) > 10000) {
-            $num_docs_cache[$index_name] = [];
-        }
-        $num_docs_cache[$index_name][$term] = $total;
-        return $total;
-    }
-
     /**
      * Finds posting info related to the most recent version
      * of a URL in the given index
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 65bb718e6..8d89c1a9d 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -362,8 +362,8 @@ class WordIterator extends IndexBundleIterator
             $this->empty = ($this->num_generations == 0);
         }
         $this->term_info_computed = true;
-        $this->no_more_generations = ($this->num_generations <
-            C\NUM_DISTINCT_GENERATIONS);
+        $this->no_more_generations = $this->index_version >= 3
+            || count($info) < C\NUM_DISTINCT_GENERATIONS;
     }
     /**
      * Hook function used by currentDocsWithWord to return the current block
@@ -1065,12 +1065,7 @@ class WordIterator extends IndexBundleIterator
                     $index_info = IndexManager::getWordInfo($this->index_name,
                         $this->word_key, 0, $this->num_generations,
                         C\NUM_DISTINCT_GENERATIONS, true);
-                    if ($this->index_version < 3) {
-                        list($estimated_remaining_total, $info) = $index_info;
-                    } else {
-                        $estimated_remaining_total = $index_info['TOTAL_COUNT'];
-                        $info = $index_info["ROWS"];
-                    }
+                    list($estimated_remaining_total, $info) = $index_info;
                     if (count($info) > 0) {
                         $this->num_docs = $this->seen_docs +
                             $estimated_remaining_total;
@@ -1078,8 +1073,8 @@ class WordIterator extends IndexBundleIterator
                         $this->dictionary_info = array_merge(
                             $this->dictionary_info, array_values($info));
                         $this->num_generations = count($this->dictionary_info);
-                        $this->no_more_generations =
-                            count($info) < C\NUM_DISTINCT_GENERATIONS;
+                        $this->no_more_generations = $this->index_version >= 3
+                            || count($info) < C\NUM_DISTINCT_GENERATIONS;
                         //will increment back to where were next loop
                         if ($is_ascending) {
                             $this->generation_pointer--;
ViewGit