Improve estimates of num results

Chris Pollett [2024-01-01 01:??:??]
Improve estimates of num results
Files changed:
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/models/PhraseModel.php
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 857c9eefb..c5426849a 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -52,6 +52,10 @@ class IntersectIterator extends IndexBundleIterator
      * @var array
      */
     public $index_bundle_iterators;
+    /**
+     * @var int
+     */
+    public $max_solution_docs;
     /**
      * Number of elements in $this->index_bundle_iterators
      * @var int
@@ -136,6 +140,7 @@ class IntersectIterator extends IndexBundleIterator
         $this->num_words = count($word_iterator_map);
         $this->num_iterators = count($index_bundle_iterators);
         $this->num_docs = 40000000000; // a really big number
+        $this->max_solution_docs = 40000000000;
         $this->quote_positions = $quote_positions;
         $this->weight = $weight;
         $this->results_per_block = 1;
@@ -160,8 +165,11 @@ class IntersectIterator extends IndexBundleIterator
             }
             $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
         }
-        $this->total_num_docs = $this->index_bundle_iterators[0]->total_num_docs
-            ?? $this->num_docs;
+        $this->total_num_docs =
+            $this->index_bundle_iterators[0]->total_num_docs ?? $this->num_docs;
+        $this->max_solution_docs =
+            $this->index_bundle_iterators[$this->least_num_doc_index
+            ]->num_docs ?? $this->num_docs;
     }
     /**
      * Returns the iterators to the first document block that it could iterate
@@ -420,13 +428,13 @@ class IntersectIterator extends IndexBundleIterator
     public function advance($gen_doc_offset = null)
     {
         $this->current_block_fresh = false;
-        $this->seen_docs += 1;
         $i = $this->least_num_doc_index;
         $this->seen_docs_unfiltered =
             $this->index_bundle_iterators[$i]->seen_docs;
         if ($this->seen_docs_unfiltered > 0) {
+            $this->seen_docs += 1;
             $this->num_docs =
-                floor(($this->seen_docs * $this->total_num_docs) /
+                floor(($this->seen_docs * $this->max_solution_docs) /
                 $this->seen_docs_unfiltered);
         }
         $this->index_bundle_iterators[0]->advance($gen_doc_offset);
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index 069118087..538e4fd47 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -47,19 +47,20 @@ class UnionIterator extends IndexBundleIterator
      * @var array
      */
     public $index_bundle_iterators;
+    /**
+     * @var int
+     */
+    public $max_solution_docs;
     /**
      * Number of elements in $this->index_bundle_iterators
      * @var int
      */
     public $num_iterators;
     /**
-     * The number of documents in the current block before filtering
-     * by restricted words
      * @var int
      */
     public $count_block_unfiltered;
     /**
-     * The number of iterated docs before the restriction test
      * @var int
      */
     public $seen_docs_unfiltered;
@@ -68,21 +69,15 @@ class UnionIterator extends IndexBundleIterator
      * @var string
      */
     public $index_name;
-    /**
-     * The total count of indexed documents in the current index
-     * @var int
-     */
-    public $total_num_docs;
     /**
      * Creates a union iterator with the given parameters.
      *
      * @param object $index_bundle_iterators to use as a source of documents
      *     to iterate over
      * @param string $index_name time_stamp of the index to use
-     * @param int $total_num_docs total number of documents in the current index
      */
     public function __construct($index_bundle_iterators,
-        $index_name, $total_num_docs)
+        $index_name)
     {
         /*
             estimate number of results by sum of all iterator counts,
@@ -100,24 +95,17 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
         $this->index_name = $index_name;
-        $this->total_num_docs = $total_num_docs;
         $num_smaller = array_fill(0, $num_iterators, 0);
+        $this->max_solution_docs = 0;;
         for ($i = 0; $i < $num_iterators; $i++) {
             $index_bundle_iterators[$i]->setResultsPerBlock(1);
             $num_docs = $index_bundle_iterators[$i]->num_docs;
-            $this->num_docs += $num_docs;
+            $this->max_solution_docs += $num_docs;
             for ($j = 0; $j < $i; $j++) {
                 if ($num_docs < $index_bundle_iterators[$j]->num_docs) {
                     $num_smaller[$j]++;
                 }
             }
-            $this->seen_docs += $index_bundle_iterators[$i]->seen_docs;
-            if (isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)){
-                $this->seen_docs_unfiltered +=
-                    $this->index_bundle_iterators[$i]->seen_docs_unfiltered;
-            } else {
-                $this->seen_docs_unfiltered += $this->seen_docs;
-            }
         }
         asort($num_smaller);
         $i = 0;
@@ -305,7 +293,7 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs += $this->count_block;
         $this->seen_docs_unfiltered += $this->count_block_unfiltered;
         $this->num_docs =
-            floor(($this->seen_docs * $this->total_num_docs) /
+            floor(($this->seen_docs * $this->max_solution_docs) /
             $this->seen_docs_unfiltered);
         if ($gen_doc_offset != null) {
             foreach ($this->index_bundle_iterators as $iterator) {
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index d1f99df9a..cdb3c075f 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -2002,7 +2002,7 @@ class PhraseModel extends ParallelModel
             $index = IndexManager::getIndex($actual_index_name);
             $index_info = $index->getArchiveInfo($index->dir_name);
             $union_iterator = new I\UnionIterator($iterators,
-                $actual_index_name, $index_info['VISITED_URLS_COUNT']);
+                $actual_index_name);
         }
         $raw = intval($raw);
         if ($raw > 0) {
ViewGit