diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index bf8e25e85..e9a816a4b 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -181,6 +181,16 @@ class IndexDocumentBundle implements CrawlConstants * @var array */ public $last_entries; + /** + * Map from int -> three character unpack string used to unpack posting info + * @var array + */ + public $unpack_map; + /** + * Array of string lengths each of $unpack_maps codes consumes + * @var array + */ + public $unpack_len_map; /** * A string consisting of a concatenated sequence * term position information for each document in turn and within this for @@ -280,6 +290,23 @@ class IndexDocumentBundle implements CrawlConstants "PRIMARY KEY" => ["TERM", 16], "DOC_MAP_INDEX" => "INT", "FREQUENCY" => "INT", "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"], $record_compressor); + $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"]; + $len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8]; + for ($i = 0; $i < 4; $i++) { + for ($j = 0; $j < 4; $j++) { + for ($k = 0; $k < 4; $k++) { + for ($m = 0; $m < 4; $m++) { + $this->unpack_map[] = + $unpack_codes[$i] . "DOC_MAP_INDEX/" . + $unpack_codes[$j] . "FREQUENCY/" . + $unpack_codes[$k] . "POSITIONS_OFFSET/" . + $unpack_codes[$m] . 
"POSITIONS_LEN"; + $this->unpack_len_map[] = $len_codes[$i] + + $len_codes[$j] + $len_codes[$k] + $len_codes[$m]; + } + } + } + } $this->last_entries_tools = new PackedTableTools([ "PRIMARY KEY" => ["TERM", 16], "LAST_INDEX" => "INT", "LAST_OFFSET" => "INT", "NUM_OCCURRENCES" => "INT"], @@ -1177,44 +1204,54 @@ class IndexDocumentBundle implements CrawlConstants return $result; } /** - * Given a table_row, which might represent several items grouped because - * they share a key, unpacks and returns the $offset through $limit numbered - * items - * @param string $postings_string - * @return array unplacked items + * Given the postings as a string for a partition for a term unpacks them + * into an array of postings, doing de-delta of doc_map_indices and + * de-delta of positions. Each posting represents occurrence of a term + * in a documents, so the frequency component is the number of occurrences + * of the term in the document. This method also computes the sum of these + * requencies over all postings in partition. 
/**
 * Given the postings as a string for a partition for a term, unpacks them
 * into an array of postings, undoing the delta encoding of DOC_MAP_INDEX
 * and of POSITIONS_OFFSET as it goes. Each posting represents the
 * occurrence of a term in a document, so its FREQUENCY component is the
 * number of occurrences of the term in that document. This method also
 * computes the sum of these frequencies over all postings in the string.
 *
 * @param string $postings_string compressed string representation of a
 *  set of postings for a term
 * @return array a pair [array of unpacked postings, sum of frequencies
 *  of all the postings]. For an empty $postings_string the pair
 *  [[], 0] is returned so that callers can always destructure the result
 */
public function unpackPostings($postings_string)
{
    /* Bail out before touching the string, and keep the return shape
       a pair even when there is nothing to decode
     */
    if (empty($postings_string)) {
        return [[], 0];
    }
    $unpack_map = $this->unpack_map;
    $unpack_len_map = $this->unpack_len_map;
    $postings_len = strlen($postings_string);
    $current_pos = 0;
    /* presumably vByteDecode advances $current_pos past the encoded
       count (it is handed the position variable) -- confirm against
       its definition
     */
    $num_items = vByteDecode($postings_string, $current_pos);
    $items = [];
    $sum_frequencies = 0;
    $doc_map_index = 0;
    $positions_offset = 0;
    for ($i = 0; $i < $num_items; $i++) {
        // stop rather than read past the end of truncated data
        if ($current_pos >= $postings_len) {
            break;
        }
        /* One header byte selects, via the precomputed lookup tables,
           both the unpack() format of the four fields and the number
           of bytes those fields occupy
         */
        $int_info = ord($postings_string[$current_pos]);
        $current_pos++;
        $item_len = $unpack_len_map[$int_info];
        if ($current_pos + $item_len > $postings_len) {
            break;
        }
        $item = unpack($unpack_map[$int_info], $postings_string,
            $current_pos);
        $current_pos += $item_len;
        /* Stored DOC_MAP_INDEX and POSITIONS_OFFSET are differences
           from the previous posting; the running totals are the
           actual values
         */
        $doc_map_index += $item["DOC_MAP_INDEX"];
        $positions_offset += $item["POSITIONS_OFFSET"];
        $sum_frequencies += $item["FREQUENCY"];
        $item["DOC_MAP_INDEX"] = $doc_map_index;
        $item["POSITIONS_OFFSET"] = $positions_offset;
        $items[] = $item;
    }
    return [$items, $sum_frequencies];
}
/**
 * Compares two [generation, offset] pairs, ordering first by generation
 * (index 0) and, when the generations are equal, by offset (index 1).
 *
 * @param array $gen_doc1 first pair: [0] is the generation, [1] the
 *  offset
 * @param array $gen_doc2 second pair, same layout as $gen_doc1
 * @param int $direction whether the comparison is for
 *  a self::ASCENDING or a self::DESCENDING search
 * @return int -1,0,1 depending on which is bigger; the sign is flipped
 *  when $direction is not self::ASCENDING
 */
public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction =
    self::ASCENDING)
{
    /* <=> always yields exactly -1, 0 or 1, which keeps the documented
       return contract; returning raw differences would not
     */
    $cmp = $gen_doc1[0] <=> $gen_doc2[0];
    if ($cmp === 0) {
        // same generation, so the offset decides
        $cmp = $gen_doc1[1] <=> $gen_doc2[1];
    }
    return ($direction == self::ASCENDING) ? $cmp : -$cmp;
}
Depending on the diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 3295874cc..1e98c84c2 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -181,12 +181,12 @@ class IntersectIterator extends IndexBundleIterator */ public function findDocsWithWord() { + static $test_time = 0; $status = $this->syncGenDocOffsetsAmongstIterators(); if ($status == -1) { return -1; } //next we finish computing BM25F - $retrieve_postings_time = microtime(true); $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); $weight = $this->weight; if (is_array($docs) && count($docs) == 1) { @@ -446,7 +446,6 @@ class IntersectIterator extends IndexBundleIterator if (empty($this->index_bundle_iterators[$i])) { return -1; } - $retrieve_postings_time = microtime(true); if ((($cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord()) == -1) || time() > $time_out) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 3b373e67f..3cb1ac91c 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -503,9 +503,15 @@ class WordIterator extends IndexBundleIterator return $key_postings; } /** - * @param int $num_words - * @param array $positions - * @param array $descriptions_scores + * Normalizes the frequencies of a term within a document with respect to + * the length of the document, the positions of the term with the document + * and the overall importance score for a given position within the document + * + * @param int $num_words number of terms in the document + * @param array $positions positions of this iterators term in the document + * @param array $descriptions_scores boundaries and scores of different + * regions with document + * @return float 
normalized frequency */ public function frequencyNormalization($num_words, $positions, $descriptions_scores) @@ -822,10 +828,16 @@ class WordIterator extends IndexBundleIterator } while($gen_check); } /** - * @param int $generation + * Given a partition number in the the index's PartitionDocumentBundle + * retrieves all the posting for the word iterator's term in that + * partition. + * + * @param int $generation partition to get postings for + * @return array of posting items */ public function getGenerationPostings($generation) { + static $test_time = 0; if ($this->index_version < 3 || empty($this->dictionary_info[$generation])) { return []; @@ -842,9 +854,7 @@ class WordIterator extends IndexBundleIterator $this->archive_file, $generation_info['POSTINGS'], $generation_info['LAST_BLOB_LEN']); } - $postings = $index->unpackPostings($postings_entry); - $index->deDeltaPostingsSumFrequencies( - $postings); + list($postings,) = $index->unpackPostings($postings_entry); $this->dictionary_info[$generation]['POSTINGS'] = $postings; unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']); return $postings; diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 880ae4f91..c30c96c19 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -1390,8 +1390,10 @@ class PhraseModel extends ParallelModel } $summaries_time = microtime(true); } - // use 2* $num because might have some dedpulcation/robot exclusions - $get_pages = array_slice($pages, $limit, 2 * $num); + /* Use 2* $num because might have some deduplication/robot exclusions + Go from 0 since there may be deduplciation/exclusions before limit + */ + $get_pages = array_slice($pages, 0, $limit + 2 * $num); $to_get_count = count($get_pages); $groups_with_docs = false; if (preg_match("/\bsite:doc\b/", $original_query)) { @@ -1438,7 +1440,7 @@ class PhraseModel extends ParallelModel $results['TOTAL_ROWS'] = ceil( (count($out_pages) * $results['TOTAL_ROWS']) / $cur_limit); } - 
$out_pages = array_slice($out_pages, 0, $num); + $out_pages = array_slice($out_pages, $limit, $num); if (C\QUERY_STATISTICS) { $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES"); if ($summary_times_string) {