Add more sanity checks for IndexDocumentBundle::unpackPostings
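Re-enable the decode-error logging in unpackPostings, return the postings
decoded so far ([$items, $sum_frequencies]) instead of an empty array when
sanity checks 1 and 2 fail, and add sanity check 3, which stops decoding when
a posting's FREQUENCY exceeds MAX_DESCRIPTION_LEN. In WordIterator, clamp
$len_description to at least $len_term and 1 so the divisor used to compute
$frequency_term is never zero or negative, and drop a stray semicolon.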
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 2faed10e5..a40ece3e9 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -1614,28 +1614,39 @@ class IndexDocumentBundle implements CrawlConstants
$len_posting_strings = strlen($postings_string);
for ($i = 0; $i < $num_items; $i++) {
if (!isset($postings_string[$current_pos])) {
- // crawlLog("Posting decode error");
- // crawlLog("..Number to decode items: " . $num_items);
- // crawlLog("..Number decoded: " . $i);
- // crawlLog("..Length posting string: " .
- // strlen($postings_string));
- // crawlLog("..Current position: " . $current_pos);
- return []; // sanity check 1
+ crawlLog("Posting decode error");
+ crawlLog("..Number to decode items: " . $num_items);
+ crawlLog("..Number decoded: " . $i);
+ crawlLog("..Length posting string: " .
+ strlen($postings_string));
+ crawlLog("..Current position: " . $current_pos);
+ return [$items, $sum_frequencies]; // sanity check 1
}
$int_info = ord($postings_string[$current_pos]);
$current_pos++;
$len_unpack_info = $unpack_len_map[$int_info];
if ($current_pos + $len_unpack_info > $len_posting_strings) {
- // crawlLog("Posting decode error");
- // crawlLog("..Number to decode items: " . $num_items);
- // crawlLog("..Number decoded: " . $i);
- // crawlLog("..Length posting string: " .
- // strlen($postings_string));
- // crawlLog("..Current position: " . $current_pos);
- return []; // sanity check 2
+ crawlLog("Posting decode error");
+ crawlLog("..Number to decode items: " . $num_items);
+ crawlLog("..Number decoded: " . $i);
+ crawlLog("..Length posting string: " .
+ strlen($postings_string));
+ crawlLog("..Current position: " . $current_pos);
+ return [$items, $sum_frequencies]; // sanity check 2
}
$pre_item = unpack($unpack_map[$int_info], $postings_string,
$current_pos);
+ if ($pre_item["FREQUENCY"] > C\MAX_DESCRIPTION_LEN) {
+ crawlLog("Posting decode error! Frequency too large");
+ crawlLog("..Number to decode items: " . $num_items);
+ crawlLog("..Number decoded: " . $i);
+ crawlLog("..Length posting string: " .
+ strlen($postings_string));
+ crawlLog("..Current position: " . $current_pos);
+ crawlLog("..Large Frequency Observed: ".
+ $pre_item["FREQUENCY"] . " ". C\MAX_DESCRIPTION_LEN);
+ return [$items, $sum_frequencies]; // sanity check 3
+ }
$item = $pre_item;
$item["DOC_MAP_INDEX"] += $doc_map_index;
$item["POSITIONS_OFFSET"] += $positions_offset;
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 0b35c4a89..65bb718e6 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -510,7 +510,6 @@ class WordIterator extends IndexBundleIterator
} else {
$position_list = [];
}
-
return $position_list;
}
/**
@@ -801,12 +800,12 @@ class WordIterator extends IndexBundleIterator
$first_index = $mid_index;
}
}
- $weight = $descriptions_scores[$first_index]['SCORE'];;
+ $weight = $descriptions_scores[$first_index]['SCORE'];
$start_description_pos = $descriptions_scores[$first_index]['POS'];
- $len_description = ($first_index == $num_scores - 1) ?
+ $len_description = max(abs(($first_index == $num_scores - 1) ?
$pseudo_doc_length - $start_description_pos :
$descriptions_scores[$first_index + 1]['POS'] -
- $start_description_pos;
+ $start_description_pos), $len_term, 1);
$frequency_term = $weight * $len_term / $len_description;
if ($position <= 0) {
$bonuses += $weight; //$frequency_term;