diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 42b09abd3..880ae4f91 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1241,6 +1241,11 @@ class PhraseModel extends ParallelModel
$num_retrieved = count($pages);
}
}
+ /*
+ at this point count($pages) >= $to_retrieve or there were
+ fewer than $to_retrieve docs. Notice it may be the case that
+ count($pages) is larger than $to_retrieve
+ */
$retrieve_postings_time = L\changeInMicrotime($retrieve_postings_time);
if ($save_timestamp_name != "" && ($queue_servers == [] ||
$this->isSingleLocalhost($queue_servers))) {
@@ -1385,14 +1390,15 @@ class PhraseModel extends ParallelModel
}
$summaries_time = microtime(true);
}
- $get_pages = array_slice($pages, $start_slice, $to_retrieve);
+ // use 2 * $num because might have some deduplication/robot exclusions
+ $get_pages = array_slice($pages, $limit, 2 * $num);
$to_get_count = count($get_pages);
$groups_with_docs = false;
if (preg_match("/\bsite:doc\b/", $original_query)) {
$groups_with_docs = true;
}
$out_pages = [];
- $cur_limit = $start_slice;
+ $cur_limit = $limit;
$with_qa = (preg_match("/\bqqq\b/i", $original_query)) ? true : false;
// now calculate snippet length
$description_length = self::DEFAULT_DESCRIPTION_LENGTH;
@@ -1409,7 +1415,11 @@ class PhraseModel extends ParallelModel
}
}
}
- while (count($out_pages) < $to_get_count && $get_pages) {
+ /* this is a loop because getSummariesFromOffsets might eliminate some
+ entries forbidden by robot directives or by deduplication
+ */
+ $count_out_pages = 0;
+ while ($count_out_pages < $to_get_count && $get_pages) {
$out_pages = array_merge($out_pages,
$this->getSummariesFromOffsets($get_pages, $queue_servers,
$raw, $groups_with_docs, $with_qa, $format_words,
@@ -1417,9 +1427,10 @@ class PhraseModel extends ParallelModel
if ($save_timestamp_name != "") {
break;
}
- $cur_limit += C\MIN_RESULTS_TO_GROUP;
- $get_pages = array_slice($pages, $cur_limit,
- C\MIN_RESULTS_TO_GROUP);
+ $cur_limit += $to_get_count;
+ $count_out_pages = count($out_pages);
+ $to_get_count = 2 * ($to_get_count - $count_out_pages);
+ $get_pages = array_slice($pages, $cur_limit, $to_get_count);
}
if ($cur_limit > $results['TOTAL_ROWS']) {
$results['TOTAL_ROWS'] = count($out_pages);
@@ -1427,7 +1438,7 @@ class PhraseModel extends ParallelModel
$results['TOTAL_ROWS'] = ceil(
(count($out_pages) * $results['TOTAL_ROWS']) / $cur_limit);
}
- $out_pages = array_slice($out_pages, $limit, $num);
+ $out_pages = array_slice($out_pages, 0, $num);
if (C\QUERY_STATISTICS) {
$summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
if ($summary_times_string) {
@@ -1583,31 +1594,38 @@ class PhraseModel extends ParallelModel
$link_summaries);
$out_pages = [];
$seen_keys = [];
- /* insert lookup values back into pages, do deduplication
- Handle robot meta tags present on page
+ $seen_descriptions = [];
+ /* insert lookup values back into pages, deduplicate,
+ handle robot meta tags present on page
*/
$base_dir = C\APP_DIR . "/resources";
$i = 0;
$thumb_sites = [];
foreach ($pages as $page) {
$key = $page[self::KEY];
+ $add_page = false;
if (isset($summaries[$key]) &&
- !in_array($key, $seen_keys)) {
- $summary = & $summaries[$key];
- $seen_keys[] = $key;
- $pre_page = array_merge($page, $summary);
- if (isset($pre_page[self::ROBOT_METAS])) {
- if (!in_array("NOINDEX", $pre_page[self::ROBOT_METAS])
- &&
- !in_array("NONE", $pre_page[self::ROBOT_METAS])) {
+ !isset($seen_keys[$key])) {
+ $seen_keys[$key] = true;
+ $hash_description =
+ L\crawlHash($summaries[$key][self::DESCRIPTION] ?? "");
+ if (isset($seen_descriptions[$hash_description])) {
+ continue;
+ }
+ $seen_descriptions[$hash_description] = true;
+ $metas = $summaries[$key][self::ROBOT_METAS] ?? [];
+ if (!empty($metas)) {
+ if (!in_array("NOINDEX", $metas) &&
+ !in_array("NONE", $metas)) {
$add_page = true;
}
} else {
$add_page = true;
}
if (!empty($add_page)) {
- if (!empty($pre_page[self::IMAGE_LINK])) {
- $image_link = $pre_page[self::IMAGE_LINK];
+ $out_pages[$i] = array_merge($page, $summaries[$key]);
+ if (!empty($out_pages[$i][self::IMAGE_LINK])) {
+ $image_link = $out_pages[$i][self::IMAGE_LINK];
$link_scheme = substr($image_link, 0, 7);
if ($link_scheme == "feed://") {
$image_link_parts = explode("/", $image_link);
@@ -1631,34 +1649,18 @@ class PhraseModel extends ParallelModel
if (file_exists($image_path)) {
$image_string = $this->fileGetContents(
$image_path);
- $pre_page[self::IMAGE_LINK] =
+ $out_pages[$i][self::IMAGE_LINK] =
'data:image/jpeg;base64,' .
base64_encode($image_string);
} else {
- unset($pre_page[self::IMAGE_LINK]);
+ unset($out_pages[$i][self::IMAGE_LINK]);
}
}
}
- $out_pages[$i] = $pre_page;
$i++;
}
}
}
- $cnt = count($out_pages);
- $seen_descriptions = [];
- for ($i = 0; $i < $cnt; $i++) {
- if (($groups_with_docs && (empty($out_pages[$i][self::IS_DOC]) ||
- !empty($out_pages[$i][self::LOCATION]))) ||
- in_array($out_pages[$i][self::DESCRIPTION],
- $seen_descriptions)) {
- unset($out_pages[$i]);
- continue;
- }
- if (!empty($out_pages[$i][self::DESCRIPTION])) {
- $seen_descriptions[] = $out_pages[$i][self::DESCRIPTION];
- }
- }
- $out_pages = array_values($out_pages);
return $out_pages;
}
/**