Add more sanity checks for IndexDocumentBundle::unpackPostings

Chris Pollett [2024-01-01 19h]
Add more sanity checks for IndexDocumentBundle::unpackPostings
Filename
src/library/IndexDocumentBundle.php
src/library/index_bundle_iterators/WordIterator.php
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 2faed10e5..a40ece3e9 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -1614,28 +1614,39 @@ class IndexDocumentBundle implements CrawlConstants
         $len_posting_strings = strlen($postings_string);
         for ($i = 0; $i < $num_items; $i++) {
             if (!isset($postings_string[$current_pos])) {
-                // crawlLog("Posting decode error");
-                // crawlLog("..Number to decode items: " . $num_items);
-                // crawlLog("..Number decoded: " . $i);
-                // crawlLog("..Length posting string: " .
-                //     strlen($postings_string));
-                // crawlLog("..Current position: " . $current_pos);
-                return []; // sanity check 1
+                 crawlLog("Posting decode error");
+                 crawlLog("..Number to decode items: " . $num_items);
+                 crawlLog("..Number decoded: " . $i);
+                 crawlLog("..Length posting string: " .
+                     strlen($postings_string));
+                crawlLog("..Current position: " . $current_pos);
+                return [$items, $sum_frequencies]; // sanity check 1
             }
             $int_info = ord($postings_string[$current_pos]);
             $current_pos++;
             $len_unpack_info = $unpack_len_map[$int_info];
             if ($current_pos + $len_unpack_info > $len_posting_strings) {
-                // crawlLog("Posting decode error");
-                // crawlLog("..Number to decode items: " . $num_items);
-                // crawlLog("..Number decoded: " . $i);
-                // crawlLog("..Length posting string: " .
-                //     strlen($postings_string));
-                // crawlLog("..Current position: " . $current_pos);
-                return []; // sanity check 2
+                crawlLog("Posting decode error");
+                crawlLog("..Number to decode items: " . $num_items);
+                crawlLog("..Number decoded: " . $i);
+                crawlLog("..Length posting string: " .
+                    strlen($postings_string));
+                crawlLog("..Current position: " . $current_pos);
+                return [$items, $sum_frequencies]; // sanity check 2
             }
             $pre_item = unpack($unpack_map[$int_info], $postings_string,
                 $current_pos);
+            if ($pre_item["FREQUENCY"] > C\MAX_DESCRIPTION_LEN) {
+                crawlLog("Posting decode error! Frequency too large");
+                crawlLog("..Number to decode items: " . $num_items);
+                crawlLog("..Number decoded: " . $i);
+                crawlLog("..Length posting string: " .
+                    strlen($postings_string));
+                crawlLog("..Current position: " . $current_pos);
+                crawlLog("..Large Frequency Observed: ".
+                    $pre_item["FREQUENCY"] . " ". C\MAX_DESCRIPTION_LEN);
+                return [$items, $sum_frequencies]; // sanity check 3
+            }
             $item = $pre_item;
             $item["DOC_MAP_INDEX"] += $doc_map_index;
             $item["POSITIONS_OFFSET"] += $positions_offset;
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 0b35c4a89..65bb718e6 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -510,7 +510,6 @@ class WordIterator extends IndexBundleIterator
         } else {
             $position_list = [];
         }
-
         return $position_list;
     }
     /**
@@ -801,12 +800,12 @@ class WordIterator extends IndexBundleIterator
                     $first_index = $mid_index;
                 }
             }
-            $weight = $descriptions_scores[$first_index]['SCORE'];;
+            $weight = $descriptions_scores[$first_index]['SCORE'];
             $start_description_pos = $descriptions_scores[$first_index]['POS'];
-            $len_description = ($first_index == $num_scores - 1) ?
+            $len_description = max(abs(($first_index == $num_scores - 1) ?
                 $pseudo_doc_length - $start_description_pos :
                 $descriptions_scores[$first_index + 1]['POS'] -
-                $start_description_pos;
+                $start_description_pos), $len_term, 1);
             $frequency_term = $weight * $len_term / $len_description;
             if ($position <= 0) {
                 $bonuses += $weight; //$frequency_term;
ViewGit