Spread work of computing getSnippets to all servers not just name server, a=chris

Chris Pollett [2019-07-12 21:Jul:th]
Spread work of computing getSnippets to all servers not just name server, a=chris
diff --git a/src/controllers/CrawlController.php b/src/controllers/CrawlController.php
index 8c7648989..91d5e735c 100644
--- a/src/controllers/CrawlController.php
+++ b/src/controllers/CrawlController.php
@@ -237,7 +237,7 @@ class CrawlController extends Controller implements CrawlConstants
         $num = $this->clean($_REQUEST["num"], "int");
         $i = $this->clean($_REQUEST["i"], "int");
         $crawl_model->current_machine = $i;
-        list($lookups, $exclude_fields) =
+        list($lookups, $exclude_fields, $format_words, $description_length) =
         $our_lookups = [];
         foreach ($lookups as $lookup => $lookup_info) {
@@ -260,7 +260,7 @@ class CrawlController extends Controller implements CrawlConstants
         $items = $crawl_model->getCrawlItems($our_lookups, null,
-            $exclude_fields);
+            $exclude_fields, $format_words, $description_length);
         $this->web_site->header("Content-Type: application/octet-stream");
         $items["ELAPSED_TIME"] = L\changeInMicrotime($start_time);
         $items = gzdeflate(serialize($items));
diff --git a/src/models/Model.php b/src/models/Model.php
index a2f00bfaf..f873655ce 100755
--- a/src/models/Model.php
+++ b/src/models/Model.php
@@ -49,7 +49,6 @@ require_once __DIR__."/../library/Utility.php";
 class Model implements CrawlConstants
-    const SCORE_PRECISION = 4;
     const SNIPPET_TITLE_LENGTH = 20;
     const SNIPPET_LENGTH_LEFT = 20;
@@ -169,9 +168,8 @@ class Model implements CrawlConstants
         return file_put_contents($filename, $data);
-     * Given an array page summaries, for each summary extracts snippets which
-     * are related to a set of search words. For each snippet, bold faces the
-     * search terms, and then creates a new summary array.
+     * Given an array page summaries, for each summary check if url corresponds
+     * to a search result that was human edited, if so, replace and format it.
      * @param array $results web pages summaries (these in turn are
      *     arrays!)
@@ -179,7 +177,7 @@ class Model implements CrawlConstants
      * @param int $description_length length of the description
      * @return array summaries which have been snippified and bold faced
-    public function formatPageResults($results, $words = null,
+    public function addEditedPageResults($results, $words = null,
         $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
         if (isset($results['PAGES'])) {
@@ -214,60 +212,76 @@ class Model implements CrawlConstants
                             $page[$field] = $summary[$field];
+                    $page = $this->formatSinglePageResult($page, $words,
+                        $description_length);
+                    $pages[$i] = $page;
-            if (empty($page[self::TITLE])) {
-                $page[self::TITLE] = "";
-            }
-            $page[self::TITLE] = strip_tags($page[self::TITLE]);
-            $page[self::DESCRIPTION] = strip_tags(
-                preg_replace("/\<\s+([a-zA-Z])/", '<$1',
-                $page[self::DESCRIPTION]));
-            if (strlen($page[self::TITLE]) == 0) {
-                $offset = min(mb_strlen($page[self::DESCRIPTION]),
-                    self::SNIPPET_TITLE_LENGTH);
-                $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset);
-                $ellipsis = "";
-                if ($end_title > self::SNIPPET_TITLE_LENGTH) {
-                    $ellipsis = "...";
-                    if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) {
-                        $end_title = self::MAX_SNIPPET_TITLE_LENGTH;
-                    }
-                }
-                $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0,
-                    $end_title) . $ellipsis;
-                //still no text revert to url
-                if (strlen($page[self::TITLE]) == 0 &&
-                    isset($page[self::URL])) {
-                    $page[self::TITLE] = $page[self::URL];
+        }
+        $output['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
+        $output['PAGES'] = ($deleted_a_page) ? $pages : array_values($pages);
+        return $output;
+    }
+    /**
+     * Given a page summary, extracts snippets which
+     * are related to a set of search words. For each snippet, bold faces the
+     * search terms, and then creates a new summary array.
+     *
+     * @param array $page a single search result summary
+     * @param array $words keywords (typically what was searched on)
+     * @param int $description_length length of the description
+     * @return array $page which has been snippified and bold faced
+     */
+    public function formatSinglePageResult($page, $words = null,
+        $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
+    {
+        if (empty($page[self::TITLE])) {
+            $page[self::TITLE] = "";
+        }
+        $page[self::TITLE] = strip_tags($page[self::TITLE]);
+        $page[self::DESCRIPTION] = strip_tags(
+            preg_replace("/\<\s+([a-zA-Z])/", '<$1',
+            $page[self::DESCRIPTION]));
+        if (strlen($page[self::TITLE]) == 0) {
+            $offset = min(mb_strlen($page[self::DESCRIPTION]),
+                self::SNIPPET_TITLE_LENGTH);
+            $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset);
+            $ellipsis = "";
+            if ($end_title > self::SNIPPET_TITLE_LENGTH) {
+                $ellipsis = "...";
+                if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) {
+                    $end_title = self::MAX_SNIPPET_TITLE_LENGTH;
-            // do a little cleaning on text
-            if ($words != null) {
-                $page[self::TITLE] =
-                    $this->boldKeywords($page[self::TITLE], $words);
-                if (!isset($page[self::IS_FEED])) {
-                    $page[self::DESCRIPTION] =
-                        $this->getSnippets($page[self::DESCRIPTION],
-                        $words, $description_length);
-                }
+            $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0,
+                $end_title) . $ellipsis;
+            //still no text revert to url
+            if (strlen($page[self::TITLE]) == 0 &&
+                isset($page[self::URL])) {
+                $page[self::TITLE] = $page[self::URL];
+            }
+        }
+        // do a little cleaning on text
+        if ($words != null) {
+            $page[self::TITLE] =
+                $this->boldKeywords($page[self::TITLE], $words);
+            if (!isset($page[self::IS_FEED])) {
                 $page[self::DESCRIPTION] =
-                    $this->boldKeywords($page[self::DESCRIPTION], $words);
-            } else {
-                $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION],
-                    0, $description_length);
+                    $this->getSnippets($page[self::DESCRIPTION],
+                    $words, $description_length);
-            $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "",
-                $page[self::DESCRIPTION]);
-            $page[self::DESCRIPTION] = (substr($pre_description,0,2) == "b>") ?
-                "<" . $pre_description : $pre_description;
-            $page[self::SCORE] = mb_substr($page[self::SCORE], 0,
-                self::SCORE_PRECISION);
-            $pages[$i] = $page;
+            $page[self::DESCRIPTION] =
+                $this->boldKeywords($page[self::DESCRIPTION], $words);
+        } else {
+            $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION],
+                0, $description_length);
-        $output['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
-        $output['PAGES'] = ($deleted_a_page) ? $pages : array_values($pages);
-        return $output;
+        $page[self::TITLE] = trim($page[self::TITLE], " .");
+        $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "",
+            $page[self::DESCRIPTION]);
+        $page[self::DESCRIPTION] = (substr($pre_description, 0, 2) == "b>") ?
+            "<" . $pre_description : $pre_description;
+        return $page;
      * Given a string, extracts a snippets of text related to a given set of
@@ -279,16 +293,30 @@ class Model implements CrawlConstants
      * @param string $text haystack to extract snippet from
      * @param array $words keywords used to make look in haystack
      * @param string $description_length length of the description desired
-     * @param bool $words_change getSnippets might be called many times on
-     *      the same search page with the same $words, if true then the
-     *      preprocessing of $words is avoided and cached versions are used
      * @return string a concatenation of the extracted snippets of each word
-    public function getSnippets($text, $words, $description_length,
-        $words_change = false)
+    public function getSnippets($text, $words, $description_length)
         static $search_words = [];
+        static $last_words = "";
         static $word_regex = "";
+        if (mb_strlen($text) < $description_length) {
+            return $text;
+        }
+        if (empty($words)) {
+            $snippet_string = mb_substr($text, 0, $description_length);
+            $rpos = strrpos($snippet_string, " ");
+            if ($rpos) {
+                $snippet_string = mb_substr($snippet_string, 0, $rpos);
+            }
+            return $snippet_string;
+        }
+        $word_string = implode(" ",  $words);
+        $words_change = false;
+        if ($word_string != $last_words) {
+            $words_change = true;
+            $last_words = $word_string;
+        }
         $start_regex = "/";
         $left = self::SNIPPET_LENGTH_LEFT;
         $left3 = $left - 3;
@@ -297,16 +325,11 @@ class Model implements CrawlConstants
         $start_regex2 = "/\b(\w{3}.{0,$left3})?(?:(?:";
         $end_regex = "/ui";
         $end_regex2 = ").{0,$right}\b)+/ui";
-        if (mb_strlen($text) < $description_length) {
-            return $text;
-        }
         $ellipsis = "";
         if ($words_change || empty($search_words)) {
-            $search_words = [];
-            foreach ($words as $word) {
-                $search_words = array_merge($search_words, explode(" ", $word));
-            }
-            $search_words = array_filter(array_unique($search_words));
+            // original list of words might have had space separated phrases;
+            $search_words = array_filter(array_unique(
+                explode(" ", $word_string)));
             $word_regex = "";
             $delim = "";
             foreach ($search_words as $word) {
@@ -322,41 +345,40 @@ class Model implements CrawlConstants
             $len = mb_strlen($text_source);
             $offset = 0;
             if ($len < self::MIN_SNIPPET_LENGTH) {
-                if (preg_match($start_regex . $word_regex.
+                if (preg_match($start_regex . $word_regex .
                     $end_regex, $text_source, $match)) {
                     if (stristr($snippet_string, $text_source) === false) {
-                        $snippet_string .= $ellipsis. $text_source;
+                        $snippet_string .= $ellipsis . $text_source;
                         $ellipsis = " ... ";
-                        if (mb_strlen($snippet_string) >= $description_length) {
-                            break;
-                        }
-                continue;
-            }
-            $word_locations = [];
-            preg_match_all($start_regex2 . $word_regex . $end_regex2,
-                $text_source, $matches);
-            if (isset($matches[0])) {
-                $seen_match = [];
-                foreach ($matches[0] as $match) {
-                    if ($match >= $description_length) {
-                        $match = mb_substr($match, 0, $description_length);
-                        $rpos = strrpos($match, " ");
-                        if ($rpos) {
-                            $match = mb_substr($match, 0, $rpos);
-                        }
-                    }
-                    $match = trim($match, ".");
-                    if (stristr($snippet_string, $match) === false) {
-                        $snippet_string .= $ellipsis. $match;
-                        $ellipsis = " ... ";
-                        if (mb_strlen($snippet_string) >= $description_length) {
-                            break;
+            } else {
+                preg_match_all($start_regex2 . $word_regex . $end_regex2,
+                    $text_source, $matches);
+                if (isset($matches[0])) {
+                    $seen_match = [];
+                    foreach ($matches[0] as $match) {
+                        $match = trim($match, ".");
+                        if (stristr($snippet_string, $match) === false) {
+                            $snippet_string .= $ellipsis. $match;
+                            $ellipsis = " ... ";
+                            if (mb_strlen($snippet_string) >=
+                                $description_length) {
+                                break;
+                            }
+            if (mb_strlen($snippet_string) >= $description_length) {
+                $snippet_string = mb_substr($snippet_string, 0,
+                    $description_length);
+                $rpos = strrpos($snippet_string, " ");
+                if ($rpos) {
+                    $snippet_string = mb_substr($snippet_string, 0, $rpos);
+                }
+                break;
+            }
         return $snippet_string;
@@ -383,7 +405,7 @@ class Model implements CrawlConstants
      * Gets a list of all DBMS that work with the search engine
-     * @return array Names of availabledatasources
+     * @return array Names of available data sources
     public function getDbmsList()
diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php
index 1b97675d4..51269b15d 100755
--- a/src/models/ParallelModel.php
+++ b/src/models/ParallelModel.php
@@ -110,17 +110,22 @@ class ParallelModel extends Model
      *      the crawlItem but which should be excluded from the result.
      *      This will make the result smaller and so hopefully faster to
      *      transmit
+     * @param array $format_words words which should be highlighted in
+     *     search snippets returned
+     * @param int $description_length length of snippets to be returned
+     *      for each search result
      * @return array of summary data for the matching documents
     public function getCrawlItems($lookups, $machine_urls = null,
-        $exclude_fields = [])
+        $exclude_fields = [], $format_words = null,
+        $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
         if (!empty($machine_urls) && !$this->isSingleLocalhost($machine_urls)) {
             $summaries = $this->networkGetCrawlItems($lookups, $machine_urls,
-            $exclude_fields);
+            $exclude_fields, $format_words, $description_length);
         } else {
             $summaries = $this->nonNetworkGetCrawlItems($lookups,
-                $exclude_fields);
+                $exclude_fields, $format_words, $description_length);
         return $summaries;
@@ -129,7 +134,7 @@ class ParallelModel extends Model
      * by their url, or by group of 5-tuples of the form
      * (machine, key, index, generation, offset). This makes an execMachines
      * call to make a network request to the CrawlController's on each machine
-     * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
+     * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
      * on each machine. The results are then sent back to networkGetCrawlItems
      * and aggregated.
@@ -139,10 +144,15 @@ class ParallelModel extends Model
      *      the crawlItem but which should be excluded from the result.
      *      This will make the result smaller and so hopefully faster to
      *      transmit
+     * @param array $format_words words which should be highlighted in
+     *     search snippets returned
+     * @param int $description_length length of snippets to be returned
+     *      for each search result
      * @return array of summary data for the matching documents
     public function networkGetCrawlItems($lookups, $machine_urls,
-        $exclude_fields = [])
+        $exclude_fields = [], $format_words = null, $description_length =
         //Set-up network request
         $machines = [];
@@ -169,7 +179,8 @@ class ParallelModel extends Model
         //Make request
         $page_set = $this->execMachines("getCrawlItems",
-            $machines, serialize([$lookups, $exclude_fields]), $num_machines);
+            $machines, serialize([$lookups, $exclude_fields,
+            $format_words, $description_length]), $num_machines);
         //Aggregate results
         $summaries = [];
         $elapsed_times = [];
@@ -239,9 +250,15 @@ class ParallelModel extends Model
      *      the crawlItem but which should be excluded from the result.
      *      This will make the result smaller and so hopefully faster to
      *      transmit
+     * @param array $format_words words which should be highlighted in
+     *     search snippets returned
+     * @param int $description_length length of snippets to be returned
+     *      for each search result
      * @return array of summary data for the matching documents
-    public function nonNetworkGetCrawlItems($lookups, $exclude_fields = [])
+    public function nonNetworkGetCrawlItems($lookups, $exclude_fields = [],
+        $format_words = null, $description_length =
         $summary_offset = null;
         $generation = null;
@@ -377,6 +394,13 @@ class ParallelModel extends Model
+        if ($format_words !== null && count($summaries) > 0 &&
+            $description_length > 0) {
+            foreach ($summaries as $key => $summary) {
+                $summaries[$key] = $this->formatSinglePageResult($summary,
+                    $format_words, $description_length);
+            }
+        }
         return $summaries;
@@ -409,7 +433,7 @@ class ParallelModel extends Model
         $num_generations = $index_archive->generation_info['ACTIVE'];
         $hash_key = ($is_key) ? L\crawlHashWord($url_or_key, true) :
-            L\crawlHashWord("info:".$url_or_key, true);
+            L\crawlHashWord("info:" . $url_or_key, true);
         $info = IndexManager::getWordInfo($index_name, $hash_key, 0, 1);
         if (!isset($info[0][4])) {
             return false;
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 2fa807312..f9f5158b6 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -377,6 +377,13 @@ class PhraseModel extends ParallelModel
+            if ($format) {
+                if (count($format_words) == 0) {
+                    $format_words = null;
+                }
+            } else {
+                $format_words = null;
+            }
             if (C\QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .=
                     "$in2<b>Presentation Parse time</b>: " .
@@ -390,7 +397,7 @@ class PhraseModel extends ParallelModel
             $out_results = $this->getSummariesByHash($word_structs,
                 $low, $phrase_num, $filter, $use_cache_if_allowed, $raw,
                 $queue_servers, $phrase, $save_timestamp_name,
-                $limit_feeds);
+                $limit_feeds, $format_words);
             if (isset($out_results['PAGES']) &&
                 count($out_results['PAGES']) != 0) {
                 $out_count = 0;
@@ -465,38 +472,15 @@ class PhraseModel extends ParallelModel
         } elseif (isset($results['PAGES'])) {
             $results['TOTAL_ROWS'] = count($results['PAGES']);
-        if ($format) {
-            if (count($format_words) == 0) {
-                $format_words = null;
-            }
-        } else {
-            $format_words = null;
-        }
-        $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
-        /* additional meta word come from indexing plugins which might need
-           longer description lengths, say for recipes
-         */
-        if (isset($this->additional_meta_words) &&
-            is_array($this->additional_meta_words)) {
-            foreach ($this->additional_meta_words as $meta_word => $length) {
-                $pattern = "/$meta_word/";
-                if (preg_match($pattern, $input_phrase)) {
-                    $description_length = $length;
-                    break; // only match the first found
-                }
-            }
-        }
         if ($raw == 0 && isset($results['TOTAL_ROWS']) &&
             $results['TOTAL_ROWS'] > 0) {
-            $output = $this->formatPageResults($results, $format_words,
+            $results = $this->addEditedPageResults($results, $format_words,
             if (!empty($answer_score_map)) {
-                $output['BEST_ANSWER'] = key($answer_score_map);
+                $results['BEST_ANSWER'] = key($answer_score_map);
-        } else {
-            $output = $results;
         if (C\QUERY_STATISTICS) {
             $this->query_info['QUERY'] .= "<b>Format Time</b>: ".
@@ -506,7 +490,7 @@ class PhraseModel extends ParallelModel
             $this->db->total_time += $this->query_info['ELAPSED_TIME'];
             $this->db->query_log[] = $this->query_info;
-        return $output;
+        return $results;
      * Parses from a string phrase representing a conjunctive query, a struct
@@ -1045,13 +1029,14 @@ class PhraseModel extends ParallelModel
      *     docs after $save_timestamp 's previous iterate position.
      * @param bool $limit_feeds if true the number of feed shard items to
      *     allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT
-     *
+     * @param array $format_words words which should be highlighted in
+     *     search snippets returned
      * @return array document summaries
     public function getSummariesByHash($word_structs, $limit, $num, &$filter,
         $use_cache_if_allowed = true, $raw = 0, $queue_servers = [],
         $original_query = "", $save_timestamp_name = "",
-        $limit_feeds = true)
+        $limit_feeds = true, $format_words = null)
         $indent= "&nbsp;&nbsp;";
         $in2 = $indent . $indent;
@@ -1314,10 +1299,26 @@ class PhraseModel extends ParallelModel
         $out_pages = [];
         $cur_limit = $start_slice;
         $with_qa = (preg_match("/\bqqq\b/i", $original_query)) ? true : false;
+        // now calculate snippet length
+        $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
+        /* additional meta words come from indexing plugins which might need
+           longer description lengths, say for recipes
+         */
+        if (isset($this->additional_meta_words) &&
+            is_array($this->additional_meta_words)) {
+            foreach ($this->additional_meta_words as $meta_word => $length) {
+                $pattern = "/$meta_word/";
+                if (preg_match($pattern, $original_query)) {
+                    $description_length = $length;
+                    break; // only match the first found
+                }
+            }
+        }
         while (count($out_pages) < $to_get_count && $get_pages) {
             $out_pages = array_merge($out_pages,
                 $this->getSummariesFromOffsets($get_pages, $queue_servers,
-                $raw, $groups_with_docs, $with_qa));
+                $raw, $groups_with_docs, $with_qa, $format_words,
+                $description_length));
             if ($save_timestamp_name != "") {
@@ -1383,11 +1384,24 @@ class PhraseModel extends ParallelModel
      *     contain at least one doc as opposed to a groups with only links
      * @param bool $with_question_answer_info whether question answer info
      *      in summaries needs to be returned
+     * @param array $format_words words which should be highlighted in
+     *     search snippets returned
+     * @param int $description_length length of snippets to be returned
+     *      for each search result
      * @return array pages with summaries added
-    public function getSummariesFromOffsets(&$pages, &$queue_servers, $raw,
-        $groups_with_docs, $with_question_answer_info)
+    public function getSummariesFromOffsets(&$pages, &$queue_servers,
+        $raw, $groups_with_docs, $with_question_answer_info,
+        $format_words = null, $description_length =
+        if ($raw != 0) {
+            $format_words = null;
+        } else {
+            if ($format_words == null) {
+                $format_words = [];
+            }
+        }
         $lookups = [];
         $summary_exclude_fields = [self::HEADER, self::PAGE, self::LINKS,
@@ -1429,7 +1443,7 @@ class PhraseModel extends ParallelModel
         /* look up items (items we have a link summary for, but not doc
         $summaries = $this->getCrawlItems($lookups, $lookup_queue_servers,
-            $summary_exclude_fields);
+            $summary_exclude_fields, $format_words, $description_length);
         $lookups = [];
         // link summaries we want to remember in case don't have doc summary
         $link_summaries = [];
@@ -1459,7 +1473,7 @@ class PhraseModel extends ParallelModel
         // lookup redirects
         $loc_summaries = $this->getCrawlItems($lookups, $lookup_queue_servers,
-            $summary_exclude_fields);
+            $summary_exclude_fields, $format_words, $description_length);
         // delete summaries we found from $link_summaries
         if (is_array($loc_summaries)) {
             $loc_hashes = array_keys($loc_summaries);
diff --git a/src/views/SearchView.php b/src/views/SearchView.php
index 308df2155..d5ab36eee 100755
--- a/src/views/SearchView.php
+++ b/src/views/SearchView.php
@@ -52,6 +52,10 @@ class SearchView extends View implements CrawlConstants
      * Represent extension of Git urls
     const GIT_EXTENSION = ".git";
+    /**
+     * Number of decimals for search result scores
+     */
+    const SCORE_PRECISION = 4;
      * Draws the main landing pages as well as search result pages
@@ -440,8 +444,8 @@ class SearchView extends View implements CrawlConstants
                         e($label . ":" . number_format($score, 2) . "\n");
-                ?>" ><?=tl('search_view_score', $page[self::SCORE]) ?></span>
-                <?php
+                ?>" ><?=tl('search_view_score',number_format($page[self::SCORE],
+                    self::SCORE_PRECISION))?></span><?php