Optimizations to IndexDocumentBundle::unpackPostings, tweaks to limits of getSummariesByHash, adds more documentation, a=chris

Chris Pollett [2022-07-16 23:Jul:th]
Optimizations to IndexDocumentBundle::unpackPostings, tweaks to limits of getSummariesByHash, adds more documentation, a=chris
Filename
src/library/IndexDocumentBundle.php
src/library/index_bundle_iterators/IndexBundleIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/models/PhraseModel.php
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index bf8e25e85..e9a816a4b 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -181,6 +181,16 @@ class IndexDocumentBundle implements CrawlConstants
      * @var array
      */
     public $last_entries;
+    /**
+     * Map from int -> four-field unpack format used to unpack posting info
+     * @var array
+     */
+    public $unpack_map;
+    /**
+     * Array of string lengths each of $unpack_map's codes consumes
+     * @var array
+     */
+    public $unpack_len_map;
     /**
      * A string consisting of a concatenated sequence
      * term position information for each document in turn and within this for
@@ -280,6 +290,23 @@ class IndexDocumentBundle implements CrawlConstants
             "PRIMARY KEY" => ["TERM", 16], "DOC_MAP_INDEX" => "INT",
                 "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT",
                 "POSITIONS_LEN" => "INT"], $record_compressor);
+        $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
+        $len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8];
+        for ($i = 0; $i < 4; $i++) {
+            for ($j = 0; $j < 4; $j++) {
+                for ($k = 0; $k < 4; $k++) {
+                    for ($m = 0; $m < 4; $m++) {
+                        $this->unpack_map[] =
+                            $unpack_codes[$i] . "DOC_MAP_INDEX/" .
+                            $unpack_codes[$j] . "FREQUENCY/" .
+                            $unpack_codes[$k] . "POSITIONS_OFFSET/" .
+                            $unpack_codes[$m] . "POSITIONS_LEN";
+                        $this->unpack_len_map[] = $len_codes[$i] +
+                            $len_codes[$j] + $len_codes[$k] + $len_codes[$m];
+                    }
+                }
+            }
+        }
         $this->last_entries_tools = new PackedTableTools([
             "PRIMARY KEY" => ["TERM", 16], "LAST_INDEX" => "INT",
             "LAST_OFFSET" => "INT", "NUM_OCCURRENCES" => "INT"],
@@ -1177,44 +1204,54 @@ class IndexDocumentBundle implements CrawlConstants
         return $result;
     }
     /**
-     * Given a table_row, which might represent several items grouped because
-     * they share a key, unpacks and returns the $offset through $limit numbered
-     * items
-     * @param string $postings_string
-     * @return array unplacked items
+     * Given the postings as a string for a partition for a term unpacks them
+     * into an array of postings, doing de-delta of doc_map_indices and
+     * de-delta of positions. Each posting represents occurrence of a term
+     * in a document, so the frequency component is the number of occurrences
+     * of the term in the document. This method also computes the sum of these
+     * frequencies over all postings in the partition.
+     *
+     * @param string $postings_string compressed string representation of a
+     *   set of postings for a term
+     * @return array a pair [array of unpacked postings, sum of frequencies
+     *   of all the postings]
      */
     public function unpackPostings($postings_string)
     {
-        static $unpack_code = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
-        static $fields = ["DOC_MAP_INDEX", "FREQUENCY", "POSITIONS_OFFSET",
-            "POSITIONS_LEN"];
+        $unpack_map = $this->unpack_map;
+        $unpack_len_map = $this->unpack_len_map;
         $current_pos = 0;
         $num_items = vByteDecode($postings_string, $current_pos);
         if (empty($postings_string)) {
             return [];
         }
+        $sum_frequencies = 0;
+        $doc_map_index = 0;
+        $positions_offset = 0;
         for ($i = 0; $i < $num_items; $i++) {
             $int_info = ord($postings_string[$current_pos]);
             $current_pos++;
-            $item = [];
-            $shift = 6;
-            foreach ($fields as $field) {
-                $int_code = (($int_info) & (3 << $shift)) >> $shift;
-                $item[$field] = unpack($unpack_code[$int_code],
-                    $postings_string, $current_pos)[1];
-                $current_pos += (1 << $int_code);
-                $shift -= 2;
-            }
+            $pre_item = unpack($unpack_map[$int_info], $postings_string,
+                $current_pos);
+            $item = $pre_item;
+            $item["DOC_MAP_INDEX"] += $doc_map_index;
+            $item["POSITIONS_OFFSET"] += $positions_offset;
+            $doc_map_index += $pre_item["DOC_MAP_INDEX"];
+            $positions_offset += $pre_item["POSITIONS_OFFSET"];
+            $sum_frequencies += $pre_item["FREQUENCY"];
+            $current_pos += $unpack_len_map[$int_info];
             $items[] = $item;
         }
-        return $items;
+        return [$items, $sum_frequencies];
     }
     /**
      * Within postings DOC_MAP_INDEX and POSITION_OFFSETS to position lists are
      * stored as delta lists (difference over previous values), this method
      * undoes the delta list to restore the actual DELTA_DOC_MAP_INDEX and
      * POSITION_OFFSETS values. It also computes the of the frequencies of items
-     * within the list of postings.
+     * within the list of postings. This method is currently only used for
+     * active partition in an index (the one whose terms haven't yet been added
+     * to the B+-tree).
      *
      * @param array& $postings a reference to an array of posting lists for a
      *  term (this will be changed by this method)
diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php
index 9c9b715dc..7dd2dd453 100644
--- a/src/library/index_bundle_iterators/IndexBundleIterator.php
+++ b/src/library/index_bundle_iterators/IndexBundleIterator.php
@@ -153,38 +153,19 @@ abstract class IndexBundleIterator implements CrawlConstants
      *  a self::ASCEDNING or a self::DESCENDING search
      * @return int -1,0,1 depending on which is bigger
      */
-     public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction =
+    public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction =
         self::ASCENDING)
-     {
+    {
+        $diff1 = $gen_doc1[0] - $gen_doc2[0];
+        $diff2 = $gen_doc1[1] - $gen_doc2[1];
+        if ($diff2 == 0 && $diff1 == $diff2) {
+            return 0;
+        }
         if ($direction == self::ASCENDING) {
-            //less generation or greater
-            if ($gen_doc1[0] < $gen_doc2[0]) {
-                return -1;
-            } else if ($gen_doc1[0] > $gen_doc2[0]) {
-                return 1;
-            }
-            //less offset or greater
-            if ($gen_doc1[1] < $gen_doc2[1]) {
-                return -1;
-            } else if ($gen_doc1[1] > $gen_doc2[1]) {
-                return 1;
-            }
+            return ($diff1 != 0) ? $diff1 : $diff2;
         } else {
-            //less generation or greater for reverse
-            if ($gen_doc1[0] < $gen_doc2[0]) {
-                return 1;
-            } else if ($gen_doc1[0] > $gen_doc2[0]) {
-                return -1;
-            }
-            //less offset or greater for reverse
-            if ($gen_doc1[1] < $gen_doc2[1]) {
-                return 1;
-            } else if ($gen_doc1[1] > $gen_doc2[1]) {
-                return -1;
-            }
+            return ($diff1 != 0) ? -$diff1 : -$diff2;
         }
-        //equal
-        return 0;
     }
     /**
      * Returns the direction of a IndexBundleIterator. Depending on the
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 3295874cc..1e98c84c2 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -181,12 +181,12 @@ class IntersectIterator extends IndexBundleIterator
      */
     public function findDocsWithWord()
     {
+        static $test_time = 0;
         $status = $this->syncGenDocOffsetsAmongstIterators();
         if ($status == -1) {
             return -1;
         }
         //next we finish computing BM25F
-        $retrieve_postings_time = microtime(true);
         $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
         $weight = $this->weight;
         if (is_array($docs) && count($docs) == 1) {
@@ -446,7 +446,6 @@ class IntersectIterator extends IndexBundleIterator
             if (empty($this->index_bundle_iterators[$i])) {
                 return -1;
             }
-            $retrieve_postings_time = microtime(true);
             if ((($cur_gen_doc_offset = $this->index_bundle_iterators[
                 $i]->currentGenDocOffsetWithWord()) == -1) ||
                 time() > $time_out) {
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 3b373e67f..3cb1ac91c 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -503,9 +503,15 @@ class WordIterator extends IndexBundleIterator
         return $key_postings;
     }
     /**
-     * @param int $num_words
-     * @param array $positions
-     * @param array $descriptions_scores
+     * Normalizes the frequencies of a term within a document with respect to
+     * the length of the document, the positions of the term within the document
+     * and the overall importance score for a given position within the document
+     *
+     * @param int $num_words number of terms in the document
+     * @param array $positions positions of this iterators term in the document
+     * @param array $descriptions_scores boundaries and scores of different
+     *  regions within the document
+     * @return float normalized frequency
      */
     public function frequencyNormalization($num_words, $positions,
         $descriptions_scores)
@@ -822,10 +828,16 @@ class WordIterator extends IndexBundleIterator
         } while($gen_check);
     }
     /**
-     * @param int $generation
+     * Given a partition number in the index's PartitionDocumentBundle
+     * retrieves all the postings for the word iterator's term in that
+     * partition.
+     *
+     * @param int $generation partition to get postings for
+     * @return array of posting items
      */
     public function getGenerationPostings($generation)
     {
+        static $test_time = 0;
         if ($this->index_version < 3 ||
             empty($this->dictionary_info[$generation])) {
             return [];
@@ -842,9 +854,7 @@ class WordIterator extends IndexBundleIterator
                 $this->archive_file, $generation_info['POSTINGS'],
                 $generation_info['LAST_BLOB_LEN']);
         }
-        $postings = $index->unpackPostings($postings_entry);
-        $index->deDeltaPostingsSumFrequencies(
-            $postings);
+        list($postings,) = $index->unpackPostings($postings_entry);
         $this->dictionary_info[$generation]['POSTINGS'] = $postings;
         unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']);
         return $postings;
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 880ae4f91..c30c96c19 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1390,8 +1390,10 @@ class PhraseModel extends ParallelModel
             }
             $summaries_time = microtime(true);
         }
-        // use 2* $num because might have some dedpulcation/robot exclusions
-        $get_pages = array_slice($pages, $limit, 2 * $num);
+        /* Use 2* $num because might have some deduplication/robot exclusions
+           Go from 0 since there may be deduplication/exclusions before limit
+         */
+        $get_pages = array_slice($pages, 0, $limit + 2 * $num);
         $to_get_count = count($get_pages);
         $groups_with_docs = false;
         if (preg_match("/\bsite:doc\b/", $original_query)) {
@@ -1438,7 +1440,7 @@ class PhraseModel extends ParallelModel
             $results['TOTAL_ROWS'] = ceil(
                 (count($out_pages) * $results['TOTAL_ROWS']) / $cur_limit);
         }
-        $out_pages = array_slice($out_pages, 0, $num);
+        $out_pages = array_slice($out_pages, $limit, $num);
         if (C\QUERY_STATISTICS) {
             $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
             if ($summary_times_string) {
ViewGit