Apply NUM_DISTINCT_GENERATIONS only to old indexes
Apply NUM_DISTINCT_GENERATIONS only to old indexes
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 537b25f1b..bf1350042 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -951,11 +951,12 @@ nsconddefine('MAX_URL_LEN', 2048);
nsdefine('PAGE_RANGE_REQUEST', 1000000);
/**
* When getting information from an index dictionary in word iterator
- * how many distinct generations to read in in one go
+ * for a version < 3 index, how many distinct generations to read in one go
*/
nsconddefine('NUM_DISTINCT_GENERATIONS', 20);
/**
- * Used in computing the DOC_RANK when a going through index in descending
+ * Used in computing the DOC_RANK for version < 3 indexes when going
+ * through index in descending
* fashion. It represents an upper bound on the maximum number of
* generations an IndexArchiveBundle should have
*/
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index a40ece3e9..d77492ccf 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -1456,10 +1456,8 @@ class IndexDocumentBundle implements CrawlConstants
* dictionary
* @param int $threshold after the number of results exceeds this amount
* stop looking for more dictionary entries.
- * @param int $start_generation what generation in the index to start
- * finding occurrence of phrase from
- * @param int $num_distinct_generations from $start_generation how
- * many generation to search forward to
+ * @param int $offset
+ * @param int $num_partitions
* @param bool $with_remaining_total whether to total number of
* postings found as well or not
* @return array either [total, sequence of four tuples]
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index b6e80ef94..24a2d90cf 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -326,59 +326,6 @@ class IndexManager implements CrawlConstants
}
return ($with_remaining_total) ? [$total, $info] : $info;
}
- /**
- * Returns the number of document that a given term or phrase appears in
- * in the given index where we discount later generation -- those with
- * lower document rank more
- *
- * @param string $term what to look up in the indexes dictionary
- * no mask is used for this look up
- * @param string $index_name index to look up term or phrase in
- * @param boolean $discount_terms whether terms should be discounted
- * based on their generation or not
- * @return int number of documents
- */
- public static function discountedNumDocsTerm($term, $index_name,
- $discount_terms = true)
- {
- static $num_docs_cache = [];
- if (isset($num_docs_cache[$index_name][$term])) {
- return $num_docs_cache[$index_name][$term];
- }
- $version = self::getVersion($index_name);
- $term_id = $discount_terms ? (($version > 2) ? canonicalTerm($term) :
- crawlHashWord($term, true)) : $term;
- $word_info = self::getWordInfo($index_name, $term_id, -1, 0,
- C\NUM_DISTINCT_GENERATIONS);
- if ($version >= 3 && !empty($word_info)) {
- $word_info = $word_info['ROWS'];
- }
- if (empty($word_info)) {
- return 0.0;
- }
- $total = 0.0;
- $i = 1;
- foreach ($word_info as $generation_info) {
- if ($version < 3) {
- list($generation, , , $num_docs) = $generation_info;
- } else {
- $generation = $generation_info['PARTITION'];
- $num_docs = $generation_info['NUM_DOCS'];
- }
- $discount = $discount_terms ? max($generation + 1, $i++) : 1;
- $total += $num_docs / $discount;
- }
- if (count($num_docs_cache) > 1000) {
- $num_docs_cache = [];
- }
- if (!empty($num_docs_cache[$index_name]) &&
- count($num_docs_cache[$index_name]) > 10000) {
- $num_docs_cache[$index_name] = [];
- }
- $num_docs_cache[$index_name][$term] = $total;
- return $total;
- }
-
/**
* Finds posting info related to the most recent version
* of a URL in the given index
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 65bb718e6..8d89c1a9d 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -362,8 +362,8 @@ class WordIterator extends IndexBundleIterator
$this->empty = ($this->num_generations == 0);
}
$this->term_info_computed = true;
- $this->no_more_generations = ($this->num_generations <
- C\NUM_DISTINCT_GENERATIONS);
+ $this->no_more_generations = $this->index_version >= 3
+ || count($info) < C\NUM_DISTINCT_GENERATIONS;
}
/**
* Hook function used by currentDocsWithWord to return the current block
@@ -1065,12 +1065,7 @@ class WordIterator extends IndexBundleIterator
$index_info = IndexManager::getWordInfo($this->index_name,
$this->word_key, 0, $this->num_generations,
C\NUM_DISTINCT_GENERATIONS, true);
- if ($this->index_version < 3) {
- list($estimated_remaining_total, $info) = $index_info;
- } else {
- $estimated_remaining_total = $index_info['TOTAL_COUNT'];
- $info = $index_info["ROWS"];
- }
+ list($estimated_remaining_total, $info) = $index_info;
if (count($info) > 0) {
$this->num_docs = $this->seen_docs +
$estimated_remaining_total;
@@ -1078,8 +1073,8 @@ class WordIterator extends IndexBundleIterator
$this->dictionary_info = array_merge(
$this->dictionary_info, array_values($info));
$this->num_generations = count($this->dictionary_info);
- $this->no_more_generations =
- count($info) < C\NUM_DISTINCT_GENERATIONS;
+ $this->no_more_generations = $this->index_version >= 3
+ || count($info) < C\NUM_DISTINCT_GENERATIONS;
//will increment back to where were next loop
if ($is_ascending) {
$this->generation_pointer--;