Attempt to speed up getSummaries, a=chris

Chris Pollett [2022-07-15 23:Jul:th]
Attempt to speed up getSummaries, a=chris
Filename
src/models/PhraseModel.php
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 42b09abd3..880ae4f91 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1241,6 +1241,11 @@ class PhraseModel extends ParallelModel
                 $num_retrieved = count($pages);
             }
         }
+        /*
+            at this point count($pages) >= $to_retrieve or there were
+            fewer than $to_retrieve docs. Notice it may be the case that
+            count($pages) is larger than $to_retrieve
+         */
         $retrieve_postings_time = L\changeInMicrotime($retrieve_postings_time);
         if ($save_timestamp_name != "" && ($queue_servers == [] ||
             $this->isSingleLocalhost($queue_servers))) {
@@ -1385,14 +1390,15 @@ class PhraseModel extends ParallelModel
             }
             $summaries_time = microtime(true);
         }
-        $get_pages = array_slice($pages, $start_slice, $to_retrieve);
+        // use 2 * $num since there may be deduplication/robot exclusions
+        $get_pages = array_slice($pages, $limit, 2 * $num);
         $to_get_count = count($get_pages);
         $groups_with_docs = false;
         if (preg_match("/\bsite:doc\b/", $original_query)) {
             $groups_with_docs = true;
         }
         $out_pages = [];
-        $cur_limit = $start_slice;
+        $cur_limit = $limit;
         $with_qa = (preg_match("/\bqqq\b/i", $original_query)) ? true : false;
         // now calculate snippet length
         $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
@@ -1409,7 +1415,11 @@ class PhraseModel extends ParallelModel
                 }
             }
         }
-        while (count($out_pages) < $to_get_count && $get_pages) {
+        /* this is a loop because getSummariesFromOffsets might eliminate some
+           entries forbidden by robot directives or by deduplication
+         */
+        $count_out_pages = 0;
+        while ($count_out_pages < $to_get_count && $get_pages) {
             $out_pages = array_merge($out_pages,
                 $this->getSummariesFromOffsets($get_pages, $queue_servers,
                 $raw, $groups_with_docs, $with_qa, $format_words,
@@ -1417,9 +1427,10 @@ class PhraseModel extends ParallelModel
             if ($save_timestamp_name != "") {
                 break;
             }
-            $cur_limit += C\MIN_RESULTS_TO_GROUP;
-            $get_pages = array_slice($pages, $cur_limit,
-                C\MIN_RESULTS_TO_GROUP);
+            $cur_limit += $to_get_count;
+            $count_out_pages = count($out_pages);
+            $to_get_count =  2 * ($to_get_count - $count_out_pages);
+            $get_pages = array_slice($pages, $cur_limit, $to_get_count);
         }
         if ($cur_limit > $results['TOTAL_ROWS']) {
             $results['TOTAL_ROWS'] = count($out_pages);
@@ -1427,7 +1438,7 @@ class PhraseModel extends ParallelModel
             $results['TOTAL_ROWS'] = ceil(
                 (count($out_pages) * $results['TOTAL_ROWS']) / $cur_limit);
         }
-        $out_pages = array_slice($out_pages, $limit, $num);
+        $out_pages = array_slice($out_pages, 0, $num);
         if (C\QUERY_STATISTICS) {
             $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
             if ($summary_times_string) {
@@ -1583,31 +1594,38 @@ class PhraseModel extends ParallelModel
             $link_summaries);
         $out_pages = [];
         $seen_keys = [];
-        /* insert lookup values back into pages, do deduplication
-           Handle robot meta tags present on page
+        $seen_descriptions = [];
+        /* insert lookup values back into pages, dedup,
+           handle robot meta tags present on page
          */
         $base_dir = C\APP_DIR . "/resources";
         $i = 0;
         $thumb_sites = [];
         foreach ($pages as $page) {
             $key = $page[self::KEY];
+            $add_page = false;
             if (isset($summaries[$key]) &&
-                !in_array($key, $seen_keys)) {
-                $summary = & $summaries[$key];
-                $seen_keys[] = $key;
-                $pre_page = array_merge($page, $summary);
-                if (isset($pre_page[self::ROBOT_METAS])) {
-                    if (!in_array("NOINDEX", $pre_page[self::ROBOT_METAS])
-                         &&
-                        !in_array("NONE", $pre_page[self::ROBOT_METAS])) {
+                !isset($seen_keys[$key])) {
+                $seen_keys[$key] = true;
+                $hash_description =
+                    L\crawlHash($summaries[$key][self::DESCRIPTION] ?? "");
+                if (isset($seen_descriptions[$hash_description])) {
+                    continue;
+                }
+                $seen_descriptions[$hash_description] = true;
+                $metas = $summaries[$key][self::ROBOT_METAS] ?? [];
+                if (!empty($metas)) {
+                    if (!in_array("NOINDEX", $metas) &&
+                        !in_array("NONE", $metas)) {
                         $add_page = true;
                     }
                 } else {
                     $add_page = true;
                 }
                 if (!empty($add_page)) {
-                    if (!empty($pre_page[self::IMAGE_LINK])) {
-                        $image_link = $pre_page[self::IMAGE_LINK];
+                    $out_pages[$i] = array_merge($page, $summaries[$key]);
+                    if (!empty($out_pages[$i][self::IMAGE_LINK])) {
+                        $image_link = $out_pages[$i][self::IMAGE_LINK];
                         $link_scheme = substr($image_link, 0, 7);
                         if ($link_scheme == "feed://") {
                             $image_link_parts = explode("/", $image_link);
@@ -1631,34 +1649,18 @@ class PhraseModel extends ParallelModel
                             if (file_exists($image_path)) {
                                 $image_string = $this->fileGetContents(
                                     $image_path);
-                                $pre_page[self::IMAGE_LINK] =
+                                $out_pages[$i][self::IMAGE_LINK] =
                                     'data:image/jpeg;base64,' .
                                     base64_encode($image_string);
                             } else {
-                                unset($pre_page[self::IMAGE_LINK]);
+                                unset($out_pages[$i][self::IMAGE_LINK]);
                             }
                         }
                     }
-                    $out_pages[$i] = $pre_page;
                     $i++;
                 }
             }
         }
-        $cnt = count($out_pages);
-        $seen_descriptions = [];
-        for ($i = 0; $i < $cnt; $i++) {
-            if (($groups_with_docs && (empty($out_pages[$i][self::IS_DOC]) ||
-                !empty($out_pages[$i][self::LOCATION]))) ||
-                in_array($out_pages[$i][self::DESCRIPTION],
-                $seen_descriptions)) {
-                unset($out_pages[$i]);
-                continue;
-            }
-            if (!empty($out_pages[$i][self::DESCRIPTION])) {
-                $seen_descriptions[] = $out_pages[$i][self::DESCRIPTION];
-            }
-        }
-        $out_pages = array_values($out_pages);
         return $out_pages;
     }
     /**
ViewGit