More work on ranking factors, pass for network queries, don't do proximity unless at least 2 non-meta terms, a=chris

Chris Pollett [2022-07-28 03:Jul:th]
More work on ranking factors, pass for network queries, don't do proximity unless at least 2 non-meta terms, a=chris
Filename
src/configs/Config.php
src/configs/PublicHelpPages.php
src/controllers/SearchController.php
src/controllers/components/CrawlComponent.php
src/data/public_default.db
src/library/IndexDocumentBundle.php
src/library/UrlParser.php
src/library/index_bundle_iterators/NetworkIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/locale/ar/configure.ini
src/locale/bn/configure.ini
src/locale/de/configure.ini
src/locale/el_GR/configure.ini
src/locale/en_US/configure.ini
src/locale/es/configure.ini
src/locale/fa/configure.ini
src/locale/fr_FR/configure.ini
src/locale/he/configure.ini
src/locale/hi/configure.ini
src/locale/id/configure.ini
src/locale/it/configure.ini
src/locale/ja/configure.ini
src/locale/kn/configure.ini
src/locale/ko/configure.ini
src/locale/nl/configure.ini
src/locale/pl/configure.ini
src/locale/pt/configure.ini
src/locale/ru/configure.ini
src/locale/te/configure.ini
src/locale/th/configure.ini
src/locale/tl/configure.ini
src/locale/tr/configure.ini
src/locale/vi_VN/configure.ini
src/locale/vi_VN/statistics.txt
src/locale/zh_CN/configure.ini
src/models/PhraseModel.php
src/models/ProfileModel.php
src/views/elements/PageoptionsElement.php
src/views/elements/SearchElement.php
tests/IndexDocumentBundleTest.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 574b60964..7ee53291f 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -507,15 +507,28 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
     nsdefine('SUBSEARCH_LINK', true);
     /**
      *  Bonus to add to relevance score if the hostname contains the search
-     *  term.
+     *  term. If hostname has 1 keyword bonus is 6, if host has 3 keywords
+     *  each would get 2.
      */
-    nsdefine('HOST_KEYWORD_BONUS', 10);
+    nsdefine('HOST_KEYWORD_BONUS', 6);
+    /**
+     *  Bonus to add to relevance score if the title contains the search
+     *  term. If the title has 1 term bonus would be 5 if 10 terms then 0.5
+     *  each
+     */
+    nsdefine('TITLE_BONUS', 5);
+    /**
+     *  Bonus to add to relevance score if the url path contains the search
+     *  term. If the path has 1 term bonus would be 3 if 3 terms then 1
+     *  each
+     */
+    nsdefine('PATH_KEYWORD_BONUS', 3);
     /**
      *  Bonus to add to doc rank score if the url is a company level domain
      */
     nsdefine('CLD_URL_BONUS', 2);
     /**
-     *  Bonus to add to doc rank score if url is a a hostname
+     *  Bonus to add to doc rank score if the url is a hostname
      */
     nsdefine('HOST_URL_BONUS', 0.5);
     /**
diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php
index 6a318b654..7ae6984b9 100644
--- a/src/configs/PublicHelpPages.php
+++ b/src/configs/PublicHelpPages.php
@@ -42605,7 +42605,9 @@ url_shortener=

 END_HEAD_VARSThe score used to rank a page is computed as the document rank score + a relevance score. This score is then combined with a proximity score if the query has more than two terms using reciprocal rank fusion. Below are some bonuses which may be applied to the document rank and relevance scores.

-; '''Host Keyword Bonus''' :  Bonus to add to relevance score if the hostname contains the search term.
+; '''Host Keyword Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of host name keywords is the fraction of this bonus that will be added to the relevance score.
+; '''Title Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the title is the fraction of this bonus that will be added to the relevance score.
+; '''Path Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the path portion of the url is the fraction of this bonus that will be added to the relevance score.
 ; '''CLD Url Bonus''' : Bonus to add to doc rank score if the url is a company level domain.
 ; '''Host Url Bonus''' : Bonus to add to doc rank score if the url is a a hostname.

diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index d9ed7d1e3..d36888bd8 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -126,7 +126,8 @@ class SearchController extends Controller implements CrawlConstants
         if (!$format_info) {
             return;
         }
-        list($view, $web_flag, $raw, $results_per_page, $limit) = $format_info;
+        list($view, $web_flag, $raw, $results_per_page, $limit,
+            $ranking_factors) = $format_info;
         list($index_timestamp, $index_info, $save_timestamp) =
             $this->initializeIndexInfo($web_flag, $raw, $data);
         unset($_SESSION['LAST_ACTIVITY']);
@@ -134,7 +135,7 @@ class SearchController extends Controller implements CrawlConstants
             if (!in_array($activity, ["cache", "trending"])) {
                 $this->processQuery($data, $query, $activity, $arg,
                     $results_per_page, $limit, $index_timestamp, $raw,
-                    $save_timestamp);
+                    $save_timestamp, $ranking_factors);
                     // calculate the results of a search if there is one
             } else if ($activity == "cache") {
                 if (isset($_REQUEST['repository'])) {
@@ -319,7 +320,22 @@ class SearchController extends Controller implements CrawlConstants
         } else {
             $limit = 0;
         }
-        return [$view, $web_flag, $raw, $results_per_page, $limit];
+        $ranking_factors = [];
+        foreach (["cld_url_bonus" => C\CLD_URL_BONUS,
+            "host_url_bonus" => C\HOST_URL_BONUS,
+            "host_keyword_bonus" => C\HOST_KEYWORD_BONUS,
+            "path_keyword_bonus" => C\PATH_KEYWORD_BONUS,
+            "title_bonus" => C\TITLE_BONUS,
+            ] as $factor => $default) {
+            if (isset($_REQUEST[$factor])) {
+                $ranking_factors[strtoupper($factor)] =
+                    $this->clean($_REQUEST[$factor], "float");
+            } else {
+                $ranking_factors[strtoupper($factor)] = $default;
+            }
+        }
+        return [$view, $web_flag, $raw, $results_per_page, $limit,
+            $ranking_factors];
     }
     /**
      * Determines if query results are using a subsearch, and if so
@@ -1201,10 +1217,12 @@ EOD;
      *     use of the timestamp. $save_time_stamp may also be in the format
      *     of string timestamp-query_part to handle networked queries involving
      *     presentations
+     * @param array $ranking_factors fields saying how url, keywords, and
+     *     title words should influence relevance and doc rank calculations
      */
     public function processQuery(&$data, $query, $activity, $arg,
         $results_per_page, $limit = 0, $index_name = 0, $raw = 0,
-        $save_timestamp = 0)
+        $save_timestamp = 0, $ranking_factors = [])
     {
         $no_index_given = false;
         $crawl_model = $this->model("crawl");
@@ -1279,7 +1297,8 @@ EOD;
                 $phrase_results = $phrase_model->getPhrasePageResults(
                     $top_query, $limit, $to_show, false,
                     $verticals_model, $use_cache_if_possible, $raw,
-                    $queue_servers, $guess_semantics, $save_timestamp);
+                    $queue_servers, $guess_semantics, $save_timestamp,
+                    $ranking_factors);
                 $data['PAGING_QUERY']['a'] = 'related';
                 $data['PAGING_QUERY']['arg'] = $url;
                 if (!empty($this->subsearch_name)) {
@@ -1299,7 +1318,8 @@ EOD;
                         $phrase_model->getPhrasePageResults(
                             $query, $limit, $to_show, true, $verticals_model,
                             $use_cache_if_possible, $raw, $queue_servers,
-                            $guess_semantics, $save_timestamp);
+                            $guess_semantics, $save_timestamp,
+                            $ranking_factors);
                     $query = $original_query;
                     if ($limit == 0) {
                         $callout_info = $verticals_model->getKnowledgeWiki(
@@ -1737,11 +1757,12 @@ EOD;
      * @param int $save_timestamp if this timestamp is nonzero, then save
      *     iterate position, so can resume on future queries that make
      *     use of the timestamp
-     *
+     * @param array $ranking_factors fields saying how url, keywords, and
+     *     title words should influence relevance and doc rank calculations
      * @return array associative array of results for the query performed
      */
     public function queryRequest($query, $results_per_page, $limit = 0,
-        $grouping = 0, $save_timestamp = 0)
+        $grouping = 0, $save_timestamp = 0, $ranking_factors = [])
     {
         if (!C\API_ACCESS) {
             return null;
@@ -1749,7 +1770,7 @@ EOD;
         $grouping = ($grouping > 0 ) ? 2 : 0;
         $data = [];
         $this->processQuery($data, $query, "query", "", $results_per_page,
-                $limit, 0, $grouping, $save_timestamp);
+                $limit, 0, $grouping, $save_timestamp, $ranking_factors);
         return $data;
     }
     /**
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 41deceaad..fc3f58f7c 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1692,7 +1692,8 @@ class CrawlComponent extends Component implements CrawlConstants
                 'SIMILAR_LINK', 'IN_LINK',  'MORE_RESULT', 'RESULT_SCORE',
                 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_CLOUD', 'WORD_SUGGEST']);
         }
-        $bonuses = ['HOST_KEYWORD_BONUS' => 10,
+        $bonuses = ['HOST_KEYWORD_BONUS' => 6,
+            'TITLE_BONUS' => 5, 'PATH_KEYWORD_BONUS' => 3,
             'CLD_URL_BONUS' => 2, 'HOST_URL_BONUS' => 0.5,
             'MIN_RESULTS_TO_GROUP' => C\MIN_RESULTS_TO_GROUP];
         $change = false;
@@ -1702,7 +1703,7 @@ class CrawlComponent extends Component implements CrawlConstants
                     );
                 $profile[$bonus] = $data[$bonus];
                 $change = true;
-            } else if (!empty($profile[$bonus])){
+            } else if (isset($profile[$bonus]) && $profile[$bonus] != "") {
                 $data[$bonus] = $profile[$bonus];
             } else {
                 $data[$bonus] = $value;
@@ -2027,7 +2028,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 $host_words = UrlParser::getWordsInHostUrl($site[self::URL]);
                 $path_words = UrlParser::getWordsLastPathPartUrl(
                     $site[self::URL]);
-                $phrase_string = $host_words . " .. ".$site[self::TITLE] .
+                $phrase_string = $host_words . " .. " . $site[self::TITLE] .
                     " ..  ". $path_words . " .. ". $site[self::DESCRIPTION];
                 if (empty($site[self::LANG])) {
                     $lang = L\guessLocaleFromString($phrase_string,
diff --git a/src/data/public_default.db b/src/data/public_default.db
index ed85c7a25..6a6900c34 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 1cfc75ea8..a1ed42607 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -745,7 +745,9 @@ class IndexDocumentBundle implements CrawlConstants
             $link_to = "";
         }
         $word_lists = [];
-        $title_length = 0;
+        $host_keywords_end_pos = 0;
+        $title_end_pos = 0;
+        $path_keywords_end_pos = 0;
         $triplet_lists = [];
         /*
             self::JUST_METAS check to avoid getting sitemaps in results
@@ -764,14 +766,18 @@ class IndexDocumentBundle implements CrawlConstants
                         $site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
-                        $phrase_string = $host_words . " ".
-                            $site[self::TITLE] . " ". $path_words .
-                            " tztzlzngth ". $site[self::DESCRIPTION];
+                        /* r6t was chosen as short enough not to be
+                           changed by chargramming, but rare enough
+                           that it can be used as a useful splitter
+                         */
+                        $phrase_string = $host_words . " r6t ".
+                            $site[self::TITLE] . " r6t ". $path_words .
+                            " r6t ". $site[self::DESCRIPTION];
                     }
                 } else {
-                    $phrase_string = $host_words . " " .
-                        $site[self::TITLE] . " ". $path_words .
-                        " tztzlzngth ". $site[self::DESCRIPTION];
+                    $phrase_string = $host_words . " r6t " .
+                        $site[self::TITLE] . " r6t " . $path_words .
+                        " r6t ". $site[self::DESCRIPTION];
                 }
             }
             /* at this point we have already extracted meta words,
@@ -790,9 +796,17 @@ class IndexDocumentBundle implements CrawlConstants
             $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
                 $phrase_string, $lang);
             $word_lists = $word_and_qa_lists['WORD_LIST'];
-            if (!empty($word_lists["tztzlzngth"][0])) {
-                $title_length = $word_lists["tztzlzngth"][0] + 1;
-                unset($word_lists["tztzlzngth"]);
+            if (!empty($word_lists["r6t"][2])) {
+                if ($path_keywords_end_pos < 255) {
+                    $host_keywords_end_pos = $word_lists["r6t"][0];
+                    $title_end_pos = $word_lists["r6t"][1];
+                }
+                $path_keywords_end_pos = $word_lists["r6t"][2];
+                unset($word_lists["r6t"]);
+            } else if (!empty($word_lists["r6t"])) {
+                $path_keywords_end_pos = $word_lists["r6t"][
+                    count($word_lists["r6t"]) - 1];
+                unset($word_lists["r6t"]);
             }
         }
         $description_scores =
@@ -812,8 +826,8 @@ class IndexDocumentBundle implements CrawlConstants
             return "";
         }
         $this->addScoresDocMap($doc_id, $num_words,
-            $url_info[self::SCORE], $title_length, $description_scores,
-            $user_ranks);
+            $url_info[self::SCORE], $host_keywords_end_pos, $title_end_pos,
+            $path_keywords_end_pos, $description_scores, $user_ranks);
         $this->addTermPostingLists(0, $num_words,
             $word_lists, $meta_ids, $this->doc_map_counter);
         $this->doc_map_counter++;
@@ -877,19 +891,27 @@ class IndexDocumentBundle implements CrawlConstants
      * @param int $num_words number of terms in the document associated with the
      *  doc-id
      * @param float $score overall score for the important of this document
-     * @param int $title_length length of the title portion of the document
-     *  summary in terms
+     * @param int $host_keywords_end_pos end of the portion of the
+     *  document summary containing terms coming from the hostname
+     * @param int $title_end_pos end of the portion of the document
+     *  summary containing terms in the title
+     * @param int $path_keywords_end_pos end of the portion of the
+     *  document summary containing terms in the url path
      * @param array $description_scores pairs of the form (length of summary
      *  portion, score for that portion)
      * @param array $user_ranks for each user defined classifier for this crawl
      *  the float score of the classifier on this document
      */
     public function addScoresDocMap($doc_id, $num_words, $score,
-        $title_length, $description_scores, $user_ranks)
+        $host_keywords_end_pos, $title_end_pos, $path_keywords_end_pos,
+        $description_scores, $user_ranks)
     {
         $num_description_scores = count($description_scores);
+        $preface_positions =
+            ((($host_keywords_end_pos << 8) + $title_end_pos) << 8) +
+            $path_keywords_end_pos;
         $out_rows = [["POS" => $num_words, "SCORE" => floatval($score)],
-            ["POS" => $title_length, "SCORE" =>
+            ["POS" => $preface_positions, "SCORE" =>
                 floatval($num_description_scores)]];
         foreach($description_scores as $position => $score) {
             $out_rows[] = ["POS" => $position, "SCORE" => floatval($score)];
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 4a869c19a..38291640f 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -518,7 +518,7 @@ class UrlParser
             return "";
         }
         $host = $url_parts['host'];
-        $host_parts = preg_split("/\.|\-/", $host);
+        $host_parts = preg_split("/\.|\-/u", $host);
         if (count($host_parts) <= 1) {
             return "";
         }
diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php
index 9046158ad..aa7f6e4ec 100644
--- a/src/library/index_bundle_iterators/NetworkIterator.php
+++ b/src/library/index_bundle_iterators/NetworkIterator.php
@@ -86,6 +86,12 @@ class NetworkIterator extends IndexBundleIterator
      * @var int
      */
     public $hard_query;
+    /**
+     * How url, keywords, and title words should influence relevance
+     * and doc rank calculations
+     * @var array
+     */
+    public $ranking_factors;
     /** Host Key position + 1 (first char says doc, inlink or eternal link)*/
     const HOST_KEY_POS = 17;
     /** Length of a doc key*/
@@ -106,9 +112,11 @@ class NetworkIterator extends IndexBundleIterator
      *      so the queries on those machine can make savepoints. Note the
      *      format of save_timestamp is timestamp-query_part where query_part
      *      is the number of the item in a query presentation (usually 0).
+     * @param array $ranking_factors fields saying how url, keywords, and
+     *     title words should influence relevance and doc rank calculations
      */
     public function __construct($query, $queue_servers, $timestamp,
-        $filter = null, $save_timestamp_name = "")
+        $filter = null, $save_timestamp_name = "", $ranking_factors = [])
     {
         $this->results_per_block = ceil(C\MIN_RESULTS_TO_GROUP);
         $num_servers = max(1, count($queue_servers));
@@ -118,6 +126,15 @@ class NetworkIterator extends IndexBundleIterator
         $this->hard_query = false;
         $this->base_query = "q=" . urlencode($query).
             "&f=serial&network=false&raw=1&its=$timestamp&guess=false";
+        foreach (["cld_url_bonus" => C\CLD_URL_BONUS,
+            "host_url_bonus" => C\HOST_URL_BONUS,
+            "host_keyword_bonus" => C\HOST_KEYWORD_BONUS,
+            "path_keyword_bonus" => C\PATH_KEYWORD_BONUS,
+            "title_bonus" => C\TITLE_BONUS,
+            ] as $factor => $default) {
+            $this->base_query .= "&$factor=" . ($ranking_factors[$factor] ??
+                $default);
+        }
         if ($save_timestamp_name != "") {
             // used for archive crawls of crawl mixes
             $this->base_query .= "&save_timestamp=$save_timestamp_name";
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index b37e72a3d..c48145745 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -145,6 +145,12 @@ class WordIterator extends IndexBundleIterator
      * @var int
      */
     public $num_generations;
+    /**
+     * How url, keywords, and title words should influence relevance
+     * and doc rank calculations
+     * @var array
+     */
+    public $ranking_factors;
     /**
      * First shard generation that word info was obtained for
      * @var int
@@ -176,10 +182,13 @@ class WordIterator extends IndexBundleIterator
      *      added. Note: this value is not saved permanently. So you
      *      could in theory open two read only versions of the same bundle but
      *      reading the results in different directions
+     * @param array $ranking_factors fields saying how url, keywords, and
+     *     title words should influence relevance and doc rank calculations
      */
     public function __construct($word_key, $index_name, $raw = false,
         $filter = null, $results_per_block =
-        IndexBundleIterator::RESULTS_PER_BLOCK, $direction=self::ASCENDING)
+        IndexBundleIterator::RESULTS_PER_BLOCK, $direction=self::ASCENDING,
+        $ranking_factors = [])
     {
         if ($raw == false) {
             //get rid of our modified base64 encoding
@@ -197,6 +206,15 @@ class WordIterator extends IndexBundleIterator
         $this->current_block_fresh = false;
         $this->start_generation = ($direction == self::ASCENDING) ? 0 :
             "ACTIVE";
+        foreach (["CLD_URL_BONUS" => C\CLD_URL_BONUS,
+            "HOST_URL_BONUS" => C\HOST_URL_BONUS,
+            "HOST_KEYWORD_BONUS" => C\HOST_KEYWORD_BONUS,
+            "PATH_KEYWORD_BONUS" => C\PATH_KEYWORD_BONUS,
+            "TITLE_BONUS" => C\TITLE_BONUS,
+            ] as $factor => $default) {
+            $this->ranking_factors[$factor] = $ranking_factors[$factor] ??
+                $default;
+        }
         if (!$this->empty) {
             $this->reset();
         }
@@ -487,28 +505,40 @@ class WordIterator extends IndexBundleIterator
                 if(L\IndexDocumentBundle::isAHostDocId($doc_key)) {
                     $posting[self::DOC_RANK] +=
                         (L\IndexDocumentBundle::isACldDocId($doc_key)) ?
-                        C\CLD_URL_BONUS : C\HOST_URL_BONUS;
+                        $this->ranking_factors["CLD_URL_BONUS"] :
+                        $this->ranking_factors["HOST_URL_BONUS"];
                 }
             }
-            list($posting['TITLE_LENGTH'], $num_description_scores) =
+            list($preface_positions, $num_description_scores) =
                 array_values(array_shift($doc_info));
+            $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255);
+            $preface_positions = $preface_positions >> 8;
+            $posting["TITLE_END_POS"] = ($preface_positions & 255);
+            $preface_positions = $preface_positions >> 8;
+            $posting["HOST_KEYWORDS_END_POS"] = ($preface_positions & 255);
             $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                 $num_description_scores);
             if ($posting['FREQUENCY'] > 0) {
-                list($frequency, $appearance_index) =
-                    $this->frequencyNormalizationFirstAppearance(
-                    $posting[self::DOC_LEN],
+                list($frequency, $preface_score) =
+                    $this->frequencyNormalizationPrefaceScoring(
                     $posting[self::POSITION_LIST],
+                    $posting[self::DOC_LEN],
+                    $posting["HOST_KEYWORDS_END_POS"],
+                    $posting["TITLE_END_POS"],
+                    $posting["PATH_KEYWORDS_END_POS"],
                     $posting[self::DESCRIPTION_SCORES]);
+                // Divergence-from-randomness + preface score
                 $posting[self::RELEVANCE] =
-                    (log(1 + $occurrences_per_doc, 2) + $frequency *
-                    log(1 + 1/$occurrences_per_doc, 2))/ ($frequency + 1);
-                if ($appearance_index == 0) {
-                    $posting[self::RELEVANCE] +=
-                        floatval(C\HOST_KEYWORD_BONUS);
-                }
+                    ((log(1 + $occurrences_per_doc, 2) + $frequency *
+                    log(1 + 1/$occurrences_per_doc, 2))/ ($frequency + 1))
+                    + $preface_score;
             } else {
-                 $posting[self::RELEVANCE] = 1;
+                 /*
+                   this will typically be the relevance score for a meta word.
+                   As it will always be frequency 1 and have no position info,
+                   set close to 0. (Not zero to avoid div by 0's)
+                  */
+                 $posting[self::RELEVANCE] = 0.01;
             }
             $posting[self::SCORE] = $posting[self::DOC_RANK] +
                 $posting[self::RELEVANCE];
@@ -526,38 +556,67 @@ class WordIterator extends IndexBundleIterator
      * Normalizes the frequencies of a term within a document with respect to
      * the length of the document, the positions of the term with the document
      * and the overall importance score for a given position within the document
-     * also computes the index of the description score of the first appearance
-     * of the term. If this is 0 it means that the term appeared in the host
-     * name.
+     * Also computes the score of the posting for the host keywords,
+     * title keywords, and path keywords.
      *
-     * @param int $num_words number of terms in the document
      * @param array $positions positions of this iterators term in the document
+     * @param int $num_words number of terms in the document
+     * @param int $host_keywords_end_pos term offset into the document summary
+     *  that demarks the end of the host keywords portion of the summary
+     * @param int $title_end_pos absolute term offset into the document summary
+     *  that demarks the end of the title portion of the summary
+     * @param int $path_keywords_end_pos absolute term offset into the document
+     *  summary that demarks the end of the url path portion of the summary
      * @param array $descriptions_scores boundaries and scores of different
      *  regions with document
-     * @return array [normalized frequency, index of description score first
-     *      appear at]
+     * @return array [normalized frequency, score for host name, title,
+     *     and path keywords]
      */
-    public function frequencyNormalizationFirstAppearance($num_words,
-        $positions, $descriptions_scores)
+    public function frequencyNormalizationPrefaceScoring(
+        $positions, $num_words, $host_keywords_end_pos,
+        $title_end_pos, $path_keywords_end_pos, $descriptions_scores)
     {
         $num_words = max($num_words, 1);
-        $length_normalization = floatval(C\MAX_DESCRIPTION_LEN) /
-            floatval($num_words);
+        /*
+         * Amati and van Rijsbergen suggest a normalization of
+         * log_2(1 + l_avg/l_d) for divergence-from-randomness
+         * Here l_avg = average num words in a document, l_d = num words
+         * current document. C\MAX_DESCRIPTION_LEN is the max number
+         * of characters in a document. Assuming the average word is
+         * around 5 chars + whitespace char + punctuation, and most documents
+         * are summarized to close to the max character length, we
+         * approximate l_avg as C\MAX_DESCRIPTION_LEN/7 in the below.
+         */
+        $length_normalization = log(1 + C\MAX_DESCRIPTION_LEN/(7 * $num_words),
+            2);
         $first_index = 0;
         $old_pos = 0;
         if (empty($descriptions_scores)) {
             return count($positions);
         }
-        $first_score = $descriptions_scores[0]['SCORE'] ?? 1;
         $num_scores = count($descriptions_scores);
         $weighted_frequency = 0;
-        $first_appearance_index = 0;
-        $first_flag = true;
-        foreach ($positions as  $position) {
-            /* we assume positions sorted, so first index will always be larger
-               than last
-             */
+        $preface_score = 0;
+        foreach ($positions as $position) {
+            if ($position < $host_keywords_end_pos) {
+                $preface_score += $this->ranking_factors["HOST_KEYWORD_BONUS"] /
+                    max($host_keywords_end_pos - 1, 1);
+                continue;
+            } else if ($position < $title_end_pos) {
+                $preface_score += $this->ranking_factors["TITLE_BONUS"] /
+                    max($title_end_pos - $host_keywords_end_pos, 1);
+                continue;
+            } else if ($position < $path_keywords_end_pos) {
+                $preface_score += $this->ranking_factors["PATH_KEYWORD_BONUS"] /
+                    max($path_keywords_end_pos - $title_end_pos, 1);
+                continue;
+            }
             $last_index = $num_scores - 1;
+            /* description score offsets are with respect to the description
+               only so we subtract from the term position the offset of the
+               non-description
+             */
+            $position -= ($path_keywords_end_pos + 1);
             while ($first_index < $last_index) {
                 $mid_index = ceil(($first_index + $last_index)/2.0);
                 if ($descriptions_scores[$mid_index]['POS'] > $position) {
@@ -566,15 +625,11 @@ class WordIterator extends IndexBundleIterator
                     $first_index = $mid_index;
                 }
             }
-            if ($first_flag) {
-                $first_appearance_index = $first_index;
-                $first_flag = 0;
-            }
             $weight = $descriptions_scores[$first_index]['SCORE'];
             $weighted_frequency += $weight;
         }
         $frequency = $weighted_frequency * $length_normalization;
-        return [$frequency, $first_appearance_index];
+        return [$frequency, $preface_score];
     }
     /**
      * Updates the seen_docs count during an advance() call
diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini
index a12a65f32..6e8ecb831 100755
--- a/src/locale/ar/configure.ini
+++ b/src/locale/ar/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "عنوان IP"
 pageoptions_element_result_score = "نتيجة النتيجة"
 pageoptions_element_ranking_factors = "البحث ترتيب العوامل"
 pageoptions_element_host_keyword_bonus = "مكافأة استضافة الكلمات الرئيسية :"
+pageoptions_element_title_bonus = "عنوان مكافأة الكلمات الرئيسية"
+pageoptions_element_path_keyword_bonus = "مكافأة مسار الكلمات الرئيسية"
 pageoptions_element_cld_url_bonus = "مكافأة رابط نطاق الشركة :"
 pageoptions_element_host_url_bonus = "مكافأة عنوان المضيف :"
 pageoptions_element_results_grouping_options = "تجميع نتائج البحث"
diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini
index 7e6813791..59e4bdbe4 100755
--- a/src/locale/bn/configure.ini
+++ b/src/locale/bn/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "আইপি ঠিকানা"
 pageoptions_element_result_score = "ফলে স্কোর"
 pageoptions_element_ranking_factors = "সার্চ র্যাংকিং কারণের"
 pageoptions_element_host_keyword_bonus = "হোস্ট কীওয়ার্ড বোনাস:"
+pageoptions_element_title_bonus = "শিরোনাম কীওয়ার্ড বোনাস"
+pageoptions_element_path_keyword_bonus = "পাথ কীওয়ার্ড বোনাস"
 pageoptions_element_cld_url_bonus = "কোম্পানির ডোমেন ইউআরএল বোনাস:"
 pageoptions_element_host_url_bonus = "হোস্ট ইউআরএল বোনাস:"
 pageoptions_element_results_grouping_options = "অনুসন্ধান ফলাফল জোট"
diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini
index 09080a7dd..931a9e430 100755
--- a/src/locale/de/configure.ini
+++ b/src/locale/de/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Die IP-Adresse"
 pageoptions_element_result_score = "Ergebnis Punktzahl"
 pageoptions_element_ranking_factors = "Suche Ranking-Faktoren"
 pageoptions_element_host_keyword_bonus = "Schlüsselwortbonus für Gastgeber :"
+pageoptions_element_title_bonus = "Titel Stichwort Bonus"
+pageoptions_element_path_keyword_bonus = "Pfad-Schlüsselwort-Bonus"
 pageoptions_element_cld_url_bonus = "Firmen-Domain-URL-Bonus:"
 pageoptions_element_host_url_bonus = "Host-URL-Prämie:"
 pageoptions_element_results_grouping_options = "Suchergebnisse Gruppierung"
diff --git a/src/locale/el_GR/configure.ini b/src/locale/el_GR/configure.ini
index 2f2ff58bb..5d0c9c8da 100644
--- a/src/locale/el_GR/configure.ini
+++ b/src/locale/el_GR/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Διεύθυνση IP"
 pageoptions_element_result_score = "Βαθμολογία Αποτελεσμάτων"
 pageoptions_element_ranking_factors = "Παράγοντες κατάταξης αναζήτησης"
 pageoptions_element_host_keyword_bonus = "Μπόνους Λέξεων-Κλειδιών Υποδοχής :"
+pageoptions_element_title_bonus = "Τίτλος Λέξη-Κλειδί Μπόνους"
+pageoptions_element_path_keyword_bonus = "Διαδρομή Λέξη-Κλειδί Μπόνους"
 pageoptions_element_cld_url_bonus = "Μπόνους Διεύθυνσης URL Τομέα Εταιρείας :"
 pageoptions_element_host_url_bonus = "Μπόνους URL Υποδοχής :"
 pageoptions_element_results_grouping_options = "Ομαδοποίηση αποτελεσμάτων αναζήτησης"
diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini
index 821846434..de7de0c2f 100644
--- a/src/locale/en_US/configure.ini
+++ b/src/locale/en_US/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP Address"
 pageoptions_element_result_score = "Result Score"
 pageoptions_element_ranking_factors = "Search Ranking Factors"
 pageoptions_element_host_keyword_bonus = "Host Keyword Bonus:"
+pageoptions_element_title_bonus = "Title Keyword Bonus:"
+pageoptions_element_path_keyword_bonus = "Path Keyword Bonus:"
 pageoptions_element_cld_url_bonus = "Company Domain Url Bonus:"
 pageoptions_element_host_url_bonus = "Host Url Bonus:"
 pageoptions_element_results_grouping_options = "Search Results Grouping"
diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini
index 857ac68d8..3af5caeae 100755
--- a/src/locale/es/configure.ini
+++ b/src/locale/es/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Direcci&oacute;n IP"
 pageoptions_element_result_score = "Resultado De La Puntuaci&oacute;n"
 pageoptions_element_ranking_factors = "La Clasificaci&oacute;n De La B&uacute;squeda De Los Factores De"
 pageoptions_element_host_keyword_bonus = "Bonificación por palabra clave de anfitrión:"
+pageoptions_element_title_bonus = "Bono por Palabra Clave de Título"
+pageoptions_element_path_keyword_bonus = "Bonificación por Palabra Clave de Ruta"
 pageoptions_element_cld_url_bonus = "Bonificación de Url de Dominio de Empresa:"
 pageoptions_element_host_url_bonus = "Bonificación de URL de Host:"
 pageoptions_element_results_grouping_options = "Resultados De La B&uacute;squeda De La Agrupaci&oacute;n"
diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini
index 5a0ac797a..026f50d36 100755
--- a/src/locale/fa/configure.ini
+++ b/src/locale/fa/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "آدرس IP"
 pageoptions_element_result_score = "نتیجه نمره"
 pageoptions_element_ranking_factors = "جستجو رتبه بندی عوامل"
 pageoptions_element_host_keyword_bonus = "میزبان کلید واژه پاداش:"
+pageoptions_element_title_bonus = "عنوان کلید واژه پاداش"
+pageoptions_element_path_keyword_bonus = "جایزه کلید واژه مسیر"
 pageoptions_element_cld_url_bonus = "پاداش دامنه شرکت :"
 pageoptions_element_host_url_bonus = "پاداش نشانی وب میزبان :"
 pageoptions_element_results_grouping_options = "دسته&zwnj;بندی نتایج جستجو"
diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini
index 2f41b84ad..cb0997134 100755
--- a/src/locale/fr_FR/configure.ini
+++ b/src/locale/fr_FR/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Adresse IP"
 pageoptions_element_result_score = "R&eacute;sultat Score"
 pageoptions_element_ranking_factors = "La recherche des facteurs de classement"
 pageoptions_element_host_keyword_bonus = "Bonus de mot-clé hôte:"
+pageoptions_element_title_bonus = "Titre Mot-Clé Bonus"
+pageoptions_element_path_keyword_bonus = "Bonus de Mot-Clé Path"
 pageoptions_element_cld_url_bonus = "Bonus d&#039;URL de domaine d&#039;entreprise:"
 pageoptions_element_host_url_bonus = "Bonus d&#039;URL d&#039;hôte:"
 pageoptions_element_results_grouping_options = "R&eacute;sultats de la recherche groupement"
diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini
index d0929c3a6..636227e35 100755
--- a/src/locale/he/configure.ini
+++ b/src/locale/he/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "כתובת ה-IP"
 pageoptions_element_result_score = "התוצאה ציון"
 pageoptions_element_ranking_factors = "חיפוש גורמי דירוג"
 pageoptions_element_host_keyword_bonus = "בונוס מילות מפתח מארח:"
+pageoptions_element_title_bonus = "בונוס מילת מפתח כותרת"
+pageoptions_element_path_keyword_bonus = "בונוס מילות מפתח נתיב"
 pageoptions_element_cld_url_bonus = "בונוס כתובת אתר של תחום החברה :"
 pageoptions_element_host_url_bonus = "בונוס כתובת אתר מארח :"
 pageoptions_element_results_grouping_options = "תוצאות חיפוש קיבוץ"
diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini
index f618f0de0..0b1687884 100755
--- a/src/locale/hi/configure.ini
+++ b/src/locale/hi/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "आईपी पते"
 pageoptions_element_result_score = "परिणाम स्कोर"
 pageoptions_element_ranking_factors = "खोज रैंकिंग कारकों"
 pageoptions_element_host_keyword_bonus = "होस्ट कीवर्ड बोनस:"
+pageoptions_element_title_bonus = "शीर्षक कीवर्ड बोनस"
+pageoptions_element_path_keyword_bonus = "पथ कीवर्ड बोनस"
 pageoptions_element_cld_url_bonus = "कंपनी डोमेन यूआरएल बोनस:"
 pageoptions_element_host_url_bonus = "होस्ट यूआरएल बोनस:"
 pageoptions_element_results_grouping_options = "खोज परिणाम समूहीकरण"
diff --git a/src/locale/id/configure.ini b/src/locale/id/configure.ini
index 28f7c84b5..c3b8ae7a5 100755
--- a/src/locale/id/configure.ini
+++ b/src/locale/id/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Alamat IP"
 pageoptions_element_result_score = "Hasil Skor"
 pageoptions_element_ranking_factors = "Pencarian Peringkat Faktor-Faktor"
 pageoptions_element_host_keyword_bonus = "Bonus Kata Kunci Host:"
+pageoptions_element_title_bonus = "Bonus Kata Kunci Judul"
+pageoptions_element_path_keyword_bonus = "Bonus Kata Kunci Jalur"
 pageoptions_element_cld_url_bonus = "Bonus Url Domain Perusahaan:"
 pageoptions_element_host_url_bonus = "Bonus URL Tuan Rumah:"
 pageoptions_element_results_grouping_options = "Hasil Pencarian Pengelompokan"
diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini
index 750fc9507..d3604d256 100755
--- a/src/locale/it/configure.ini
+++ b/src/locale/it/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Indirizzo IP"
 pageoptions_element_result_score = "Risultato Punteggio"
 pageoptions_element_ranking_factors = "Fattori del posto"
 pageoptions_element_host_keyword_bonus = "Bonus parole chiave host:"
+pageoptions_element_title_bonus = "Titolo Parola chiave Bonus"
+pageoptions_element_path_keyword_bonus = "Bonus parole chiave percorso"
 pageoptions_element_cld_url_bonus = "Bonus Url dominio azienda:"
 pageoptions_element_host_url_bonus = "Bonus Url host:"
 pageoptions_element_results_grouping_options = "Raggruppa risultati di ricerca"
diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini
index 43eede052..599110c25 100755
--- a/src/locale/ja/configure.ini
+++ b/src/locale/ja/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IPアドレス"
 pageoptions_element_result_score = "スコア結果"
 pageoptions_element_ranking_factors = "検索ランキング因子"
 pageoptions_element_host_keyword_bonus = "ホストキーワードボーナス:"
+pageoptions_element_title_bonus = "タイトルキーワードボーナス"
+pageoptions_element_path_keyword_bonus = "パスキーワードボーナス"
 pageoptions_element_cld_url_bonus = "企業ドメインUrlボーナス:"
 pageoptions_element_host_url_bonus = "ホストUrlボーナス:"
 pageoptions_element_results_grouping_options = "検索結果の分類"
diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini
index 9b0264d04..55dc2e465 100755
--- a/src/locale/kn/configure.ini
+++ b/src/locale/kn/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP ವಿಳಾಸ"
 pageoptions_element_result_score = "ಪರಿಣಾಮವಾಗಿ ಸ್ಕೋರ್"
 pageoptions_element_ranking_factors = "ಹುಡುಕು ಶ್ರೇಯಾಂಕ ಅಂಶಗಳು"
 pageoptions_element_host_keyword_bonus = "ಹೋಸ್ಟ್ ಕೀವರ್ಡ್ ಬೋನಸ್:"
+pageoptions_element_title_bonus = "ಶೀರ್ಷಿಕೆ ಕೀವರ್ಡ್ ಬೋನಸ್"
+pageoptions_element_path_keyword_bonus = "ಪಾತ್ ಕೀವರ್ಡ್ ಬೋನಸ್"
 pageoptions_element_cld_url_bonus = "ಕಂಪನಿ ಡೊಮೇನ್ ಯುಆರ್ಎಲ್ ಬೋನಸ್:"
 pageoptions_element_host_url_bonus = "ಹೋಸ್ಟ್ ಯುಆರ್ಎಲ್ ಬೋನಸ್:"
 pageoptions_element_results_grouping_options = "ಹುಡುಕಾಟ ಫಲಿತಾಂಶಗಳು ಗುಂಪು"
diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini
index 3cba4b204..e0138cd71 100755
--- a/src/locale/ko/configure.ini
+++ b/src/locale/ko/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP 주소"
 pageoptions_element_result_score = "그 결과 점수"
 pageoptions_element_ranking_factors = "검색 순위 요소"
 pageoptions_element_host_keyword_bonus = "호스트 키워드 보너스:"
+pageoptions_element_title_bonus = "제목 키워드 보너스"
+pageoptions_element_path_keyword_bonus = "경로 키워드 보너스"
 pageoptions_element_cld_url_bonus = "회사 도메인 주소 보너스:"
 pageoptions_element_host_url_bonus = "호스트 주소 보너스:"
 pageoptions_element_results_grouping_options = "검색 결과 그룹"
diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini
index 5b8c4b5de..a9505aa44 100644
--- a/src/locale/nl/configure.ini
+++ b/src/locale/nl/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP adres"
 pageoptions_element_result_score = "Resultaat Score"
 pageoptions_element_ranking_factors = "Zoek Ranking Factors"
 pageoptions_element_host_keyword_bonus = "Host Keyword Bonus :"
+pageoptions_element_title_bonus = "Titel Trefwoord Bonus"
+pageoptions_element_path_keyword_bonus = "Pad Trefwoord Bonus"
 pageoptions_element_cld_url_bonus = "Bedrijfsdomein Url-Bonus :"
 pageoptions_element_host_url_bonus = "Host Url Bonus :"
 pageoptions_element_results_grouping_options = "Zoekresultaten Groepering"
diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini
index 14d82f2ea..143262947 100755
--- a/src/locale/pl/configure.ini
+++ b/src/locale/pl/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Adres IP"
 pageoptions_element_result_score = "Wynik Wynik "
 pageoptions_element_ranking_factors = "Czynniki Rankingu Wyszukiwania "
 pageoptions_element_host_keyword_bonus = "Host Keyword Bonus:"
+pageoptions_element_title_bonus = "Tytuł Keyword Bonus"
+pageoptions_element_path_keyword_bonus = "Path Keyword Bonus"
 pageoptions_element_cld_url_bonus = "Bonus URL Domeny Firmowej :"
 pageoptions_element_host_url_bonus = "Bonus URL Hosta:"
 pageoptions_element_results_grouping_options = "Wyniki Wyszukiwania Grupowanie"
diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini
index 6d2ebbceb..7ca1eefe0 100755
--- a/src/locale/pt/configure.ini
+++ b/src/locale/pt/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Endere&ccedil;o IP"
 pageoptions_element_result_score = "Resultado Da Pontua&ccedil;&atilde;o"
 pageoptions_element_ranking_factors = "Pesquisa De Fatores De Ranking"
 pageoptions_element_host_keyword_bonus = "Bônus De Palavra-Chave Do Host:"
+pageoptions_element_title_bonus = "Título Bonus Palavra-Chave"
+pageoptions_element_path_keyword_bonus = "Bônus De Palavra-Chave Do Caminho"
 pageoptions_element_cld_url_bonus = "Bônus De Url De Domínio Da Empresa:"
 pageoptions_element_host_url_bonus = "Bônus De URL Do Host :"
 pageoptions_element_results_grouping_options = "Resultados Da Pesquisa De Agrupamento"
diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini
index 11d83aef9..baf724223 100755
--- a/src/locale/ru/configure.ini
+++ b/src/locale/ru/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP-адрес"
 pageoptions_element_result_score = "Результат Результат "
 pageoptions_element_ranking_factors = "Факторы Ранжирования Поиска "
 pageoptions_element_host_keyword_bonus = "Бонус за ключевое слово хоста:"
+pageoptions_element_title_bonus = "Бонус за ключевое слово в названии"
+pageoptions_element_path_keyword_bonus = "Бонус за ключевое слово Path"
 pageoptions_element_cld_url_bonus = "Бонусный URL-адрес домена компании:"
 pageoptions_element_host_url_bonus = "Бонусный URL-адрес хоста:"
 pageoptions_element_results_grouping_options = "Результаты Поиска Группировка"
diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini
index ebe05c834..da79e7ea4 100644
--- a/src/locale/te/configure.ini
+++ b/src/locale/te/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP అడ్రస్"
 pageoptions_element_result_score = "ఫలితంగా స్కోరు"
 pageoptions_element_ranking_factors = "శోధన ర్యాంకింగ్ కారకాలు"
 pageoptions_element_host_keyword_bonus = "హోస్ట్ కీవర్డ్ బోనస్:"
+pageoptions_element_title_bonus = "శీర్షిక కీవర్డ్ బోనస్"
+pageoptions_element_path_keyword_bonus = "మార్గం కీవర్డ్ బోనస్"
 pageoptions_element_cld_url_bonus = "కంపెనీ డొమైన్ యూఆర్ఎల్ బోనస్:"
 pageoptions_element_host_url_bonus = "హోస్ట్ యూఆర్ఎల్ బోనస్:"
 pageoptions_element_results_grouping_options = "శోధన ఫలితాలు చోట"
diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini
index 78341b5bd..ebc3acdc7 100755
--- a/src/locale/th/configure.ini
+++ b/src/locale/th/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "ที่อยู่ IP"
 pageoptions_element_result_score = "ผลคะแนน"
 pageoptions_element_ranking_factors = "การค้นหาปัจจัยระดับสูงนอก"
 pageoptions_element_host_keyword_bonus = "โบนัสคำหลักโฮสต์:"
+pageoptions_element_title_bonus = "ชื่อโบนัสคำสำคัญ"
+pageoptions_element_path_keyword_bonus = "โบนัสคำหลักเส้นทาง"
 pageoptions_element_cld_url_bonus = "โบนัสโดเมนบริษัท:"
 pageoptions_element_host_url_bonus = "โบนัสโฮสต์:"
 pageoptions_element_results_grouping_options = "ผลการค้นหาการจัดกลุ่ม"
diff --git a/src/locale/tl/configure.ini b/src/locale/tl/configure.ini
index 2fe90febc..dfc3bacb1 100644
--- a/src/locale/tl/configure.ini
+++ b/src/locale/tl/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP Address"
 pageoptions_element_result_score = "Resulta Puntos"
 pageoptions_element_ranking_factors = "Maghanap Ng Pagra-Ranggo Ng Mga Kadahilanan"
 pageoptions_element_host_keyword_bonus = "Mag-Host Ng Keyword Bonus:"
+pageoptions_element_title_bonus = "Bonus Ng Keyword Ng Pamagat"
+pageoptions_element_path_keyword_bonus = "Path Keyword Bonus"
 pageoptions_element_cld_url_bonus = "Bonus Ng Url Ng Domain Ng Kumpanya:"
 pageoptions_element_host_url_bonus = "Host Url Bonus:"
 pageoptions_element_results_grouping_options = "Mga Resulta Ng Paghahanap Ng Pagpapangkat"
diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini
index 35f7e6ce3..5800d39bc 100755
--- a/src/locale/tr/configure.ini
+++ b/src/locale/tr/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP Adresi"
 pageoptions_element_result_score = "Sonu&ccedil; Puan"
 pageoptions_element_ranking_factors = "Arama Sıralaması Fakt&ouml;rler"
 pageoptions_element_host_keyword_bonus = "Ana Anahtar Kelime Bonusu :"
+pageoptions_element_title_bonus = "Başlık Anahtar Kelime Bonusu"
+pageoptions_element_path_keyword_bonus = "Path Anahtar Kelime Bonusu"
 pageoptions_element_cld_url_bonus = "Şirket Alan Adı Url Bonusu :"
 pageoptions_element_host_url_bonus = "Ana Bilgisayar Url Bonusu :"
 pageoptions_element_results_grouping_options = "Arama Sonu&ccedil;ları Gruplandırma"
diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini
index 0c257af60..1bf797710 100755
--- a/src/locale/vi_VN/configure.ini
+++ b/src/locale/vi_VN/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "Địa chỉ IP"
 pageoptions_element_result_score = "Quả Điểm"
 pageoptions_element_ranking_factors = "Tìm Bảng Xếp Hạng Các Yếu Tố"
 pageoptions_element_host_keyword_bonus = "Máy Chủ Từ Khóa Thưởng:"
+pageoptions_element_title_bonus = "Tiêu Đề Tiền Thưởng Từ Khóa"
+pageoptions_element_path_keyword_bonus = "Con Đường Từ Khóa Tiền Thưởng"
 pageoptions_element_cld_url_bonus = "Công Ty Miền Url Thưởng:"
 pageoptions_element_host_url_bonus = "Chủ Url Thưởng:"
 pageoptions_element_results_grouping_options = "Kết Quả Tìm Kiếm Nhóm"
diff --git a/src/locale/vi_VN/statistics.txt b/src/locale/vi_VN/statistics.txt
index b6bef56f0..5a165df53 100755
--- a/src/locale/vi_VN/statistics.txt
+++ b/src/locale/vi_VN/statistics.txt
@@ -1 +1 @@
-d:99;
\ No newline at end of file
+d:100;
\ No newline at end of file
diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini
index 40067c900..4523d4b4b 100755
--- a/src/locale/zh_CN/configure.ini
+++ b/src/locale/zh_CN/configure.ini
@@ -1387,6 +1387,8 @@ pageoptions_element_ip_link = "IP地址"
 pageoptions_element_result_score = "结果得分"
 pageoptions_element_ranking_factors = "搜索排的因素"
 pageoptions_element_host_keyword_bonus = "主机关键字奖励:"
+pageoptions_element_title_bonus = "标题关键字奖金"
+pageoptions_element_path_keyword_bonus = "路径关键字奖励"
 pageoptions_element_cld_url_bonus = "公司域名Url奖金:"
 pageoptions_element_host_url_bonus = "主机Url奖金:"
 pageoptions_element_results_grouping_options = "搜索结果的分组"
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 77e0a1b8b..2e9c6a65f 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -203,14 +203,15 @@ class PhraseModel extends ParallelModel
      * @param int $save_timestamp if this timestamp is nonzero, then save
      *     iterate position, so can resume on future queries that make
      *     use of the timestamp
-     *
+     * @param array $ranking_factors fields that say how url, keywords,
+     *     and title words should influence relevance and doc rank calculations
      * @return array an array of summary data
      */
     public function getPhrasePageResults(
         $input_phrase, $low = 0, $results_per_page = C\NUM_RESULTS_PER_PAGE,
         $format = true, $filter = null, $use_cache_if_allowed = true,
         $raw = 0, $queue_servers = [], $guess_semantics = true,
-        $save_timestamp = 0)
+        $save_timestamp = 0, $ranking_factors = [])
     {
         if (C\QUERY_STATISTICS) {
             $indent= "&nbsp;&nbsp;";
@@ -416,7 +417,7 @@ class PhraseModel extends ParallelModel
             $out_results = $this->getSummariesByHash($word_structs,
                 $low, $phrase_num, $filter, $use_cache_if_allowed, $raw,
                 $queue_servers, $phrase, $save_timestamp_name,
-                $format_words);
+                $format_words, $ranking_factors);
             if (isset($out_results['PAGES']) &&
                 count($out_results['PAGES']) != 0) {
                 $out_count = 0;
@@ -1122,12 +1123,14 @@ class PhraseModel extends ParallelModel
      *     docs after $save_timestamp 's previous iterate position.
      * @param array $format_words words which should be highlighted in
      *     search snippets returned
+     * @param array $ranking_factors fields that say how url, keywords,
+     *     and title words should influence relevance and doc rank calculations
      * @return array document summaries
      */
     public function getSummariesByHash($word_structs, $limit, $num, $filter,
         $use_cache_if_allowed = true, $raw = 0, $queue_servers = [],
         $original_query = "", $save_timestamp_name = "",
-        $format_words = null)
+        $format_words = null, $ranking_factors = [])
     {
         $indent= "&nbsp;&nbsp;";
         $in2 = $indent . $indent;
@@ -1138,10 +1141,10 @@ class PhraseModel extends ParallelModel
         }
         $use_proximity = false;
         $time = time();
-        if (count($word_structs) > 1 || (isset($word_structs[0]["KEYS"])
-            && count($word_structs[0]["KEYS"]) > 1) ||
-            ($word_structs == [] &&
-            substr_count($original_query, " ") > 1)) {
+        $test_query = trim(preg_replace("/\s+/u", " ", $original_query));
+        $approx_query_terms = substr_count($test_query, " ") -
+            substr_count($test_query, ":") + 1;
+        if ($approx_query_terms > 1) {
             $use_proximity = true;
         }
         if (empty($filter)) {
@@ -1226,7 +1229,7 @@ class PhraseModel extends ParallelModel
         $get_query_time = microtime(true);
         $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw,
             $to_retrieve, $queue_servers, $original_query,
-            $save_timestamp_name);
+            $save_timestamp_name, $ranking_factors);
         $get_query_time = L\changeInMicrotime($get_query_time);
         $num_retrieved = 0;
         $pages = [];
@@ -1302,33 +1305,42 @@ class PhraseModel extends ParallelModel
                 }
             }
             $num_fields = count($subscore_fields);
-            // Compute Reciprocal Rank Fusion Score
-            $alpha = 400/$num_fields;
-            if (isset($pages[0])) {
-                foreach ($subscore_fields as $field) {
-                    L\orderCallback($pages[0], $pages[0], $field);
-                    usort($pages, C\NS_LIB . "orderCallback");
-                    $score = 0;
-                    for ($i = 0; $i < $result_count; $i++) {
-                        if ($i > 0) {
-                            if ($pages[$i - 1][$field] != $pages[$i][$field]) {
-                                $score++;
+            if ($num_fields > 1) {
+                // Compute Reciprocal Rank Fusion Score
+                $alpha = 400/$num_fields;
+                if (isset($pages[0])) {
+                    foreach ($subscore_fields as $field) {
+                        L\orderCallback($pages[0], $pages[0], $field);
+                        usort($pages, C\NS_LIB . "orderCallback");
+                        $score = 0;
+                        for ($i = 0; $i < $result_count; $i++) {
+                            if ($i > 0) {
+                                if ($pages[$i - 1][$field] !=
+                                    $pages[$i][$field]) {
+                                    $score++;
+                                }
                             }
+                            $pages[$i]["OUT_SCORE"] += $alpha/(59 + $score);
                         }
-                        $pages[$i]["OUT_SCORE"] += $alpha/(59 + $score);
                     }
+                    L\orderCallback($pages[0], $pages[0], "OUT_SCORE");
                 }
-                L\orderCallback($pages[0], $pages[0], "OUT_SCORE");
-            }
-            usort($pages, C\NS_LIB . "orderCallback");
-            if ($use_proximity) {
-                for ($i = 0; $i < $result_count; $i++) {
-                    $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
+                usort($pages, C\NS_LIB . "orderCallback");
+                if ($use_proximity) {
+                    for ($i = 0; $i < $result_count; $i++) {
+                        $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
+                    }
+                } else {
+                    for ($i = 0; $i < $result_count; $i++) {
+                        $pages[$i][self::PROXIMITY] = 0;
+                        $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
+                    }
                 }
-            } else {
+            } else if(isset($pages[0])) {
+                L\orderCallback($pages[0], $pages[0], self::SCORE);
+                usort($pages, C\NS_LIB . "orderCallback");
                 for ($i = 0; $i < $result_count; $i++) {
-                    $pages[$i][self::PROXIMITY] = 1;
-                    $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
+                    $pages[$i][self::PROXIMITY] = 0;
                 }
             }
             $sort_time = L\changeInMicrotime($sort_start);
@@ -1691,13 +1703,14 @@ class PhraseModel extends ParallelModel
      * @param string $save_timestamp_name if this timestamp is non empty, then
      *     when making iterator get sub-iterators to advance to gen doc_offset
      *     stored with respect to save_timestamp if exists.
-     *
+     * @param array $ranking_factors fields that say how url, keywords,
+     *     and title words should influence relevance and doc rank calculations
      * @return object an iterator for iterating through results to the
      * query
      */
     public function getQueryIterator($word_structs, $filter, $raw,
         &$to_retrieve, $queue_servers = [], $original_query = "",
-        $save_timestamp_name = "")
+        $save_timestamp_name = "", $ranking_factors = [])
     {
         $iterators = [];
         $total_iterators = 0;
@@ -1721,7 +1734,8 @@ class PhraseModel extends ParallelModel
                 $index_name = $this->index_name;
             }
             $iterators[0] = new I\NetworkIterator($original_query,
-                $queue_servers, $index_name, $filter, $save_timestamp_name);
+                $queue_servers, $index_name, $filter, $save_timestamp_name,
+                $ranking_factors);
         }
         if (!$network_flag) {
             $index_name = $this->index_name ?? "";
@@ -1813,7 +1827,7 @@ class PhraseModel extends ParallelModel
                         $word_iterators[$i] =
                             new I\WordIterator($distinct_key_id,
                             $actual_index_name, true, $filter, $to_retrieve,
-                            $direction);
+                            $direction, $ranking_factors);
                         $min_group_override = true;
                     }
                     foreach ($word_keys as $index => $key) {
@@ -1836,7 +1850,7 @@ class PhraseModel extends ParallelModel
                             new I\WordIterator($disallow_keys[$i],
                                 $actual_index_name, false, $filter,
                                 I\IndexBundleIterator::RESULTS_PER_BLOCK,
-                                $direction);
+                                $direction, $ranking_factors);
                         $word_iterators[$num_word_keys + $i] =
                             new I\NegationIterator($disallow_iterator);
                     }
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 81310fd9f..47732e571 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -60,13 +60,14 @@ class ProfileModel extends Model
         'LOGO_LARGE', 'MAIL_PASSWORD',  'MAIL_SECURITY',
         'MAIL_SENDER', 'MAIL_SERVER', 'MAIL_SERVERPORT', 'MAIL_USERNAME',
         'MIN_RESULTS_TO_GROUP',  'MONETIZATION_TYPE', 'MORE_RESULT',
-        'MEDIA_MODE', 'NAME_SERVER', 'PRIVATE_DB_NAME', 'PRIVATE_DB_HOST',
+        'MEDIA_MODE', 'NAME_SERVER', 'PATH_KEYWORD_BONUS',
+        'PRIVATE_DB_NAME', 'PRIVATE_DB_HOST',
         'PRIVATE_DBMS', 'PRIVATE_DB_PASSWORD', 'PRIVATE_DB_USER',
         'PROXY_SERVERS', 'RECOVERY_MODE', 'REGISTRATION_TYPE', 'RESULT_SCORE',
         'ROBOT_INSTANCE','RSS_ACCESS', 'SEARCH_ANALYTICS_MODE',
         'SEARCHBAR_PATH', 'SEND_MAIL_MEDIA_UPDATER',
         'SESSION_NAME', 'SIDE_ADSCRIPT', 'SIDEBAR_COLOR', 'SIGNIN_LINK',
-        'SIMILAR_LINK', 'SUBSEARCH_LINK', 'TIMEZONE',
+        'SIMILAR_LINK', 'SUBSEARCH_LINK', 'TITLE_BONUS', 'TIMEZONE',
         'TOPBAR_COLOR', 'TOP_ADSCRIPT','TOR_PROXY', 'USE_FILECACHE',
         'USE_MAIL_PHP', 'USE_PROXY', 'USER_AGENT_SHORT', 'WEB_URI',
         'WEB_ACCESS', 'WORD_CLOUD', 'WORD_SUGGEST'
diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php
index 86fa57ba0..f4ab25b4b 100644
--- a/src/views/elements/PageoptionsElement.php
+++ b/src/views/elements/PageoptionsElement.php
@@ -523,6 +523,16 @@ class PageOptionsElement extends Element
             <input type="text" id="host-keyword-bonus" class="very-narrow-field"
                 maxlength="<?= C\NUM_FIELD_LEN ?>" name="HOST_KEYWORD_BONUS"
                 value="<?= $data['HOST_KEYWORD_BONUS']  ?>" /></td></tr>
+        <tr><th><label for="title-bonus"><?=
+            tl('pageoptions_element_title_bonus') ?></label></th><td>
+            <input type="text" id="title-bonus" class="very-narrow-field"
+                maxlength="<?= C\NUM_FIELD_LEN ?>" name="TITLE_BONUS"
+                value="<?= $data['TITLE_BONUS']  ?>" /></td></tr>
+        <tr><th><label for="path-keyword-bonus"><?=
+            tl('pageoptions_element_path_keyword_bonus') ?></label></th><td>
+            <input type="text" id="path-keyword-bonus" class="very-narrow-field"
+                maxlength="<?= C\NUM_FIELD_LEN ?>" name="PATH_KEYWORD_BONUS"
+                value="<?= $data['PATH_KEYWORD_BONUS']  ?>" /></td></tr>
         <tr><th><label for="cld-url-bonus"><?=
             tl('pageoptions_element_cld_url_bonus')?></label></th><td>
             <input type="text" id="cld-url-bonus" class="very-narrow-field"
diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php
index 363d5b777..21743e387 100644
--- a/src/views/elements/SearchElement.php
+++ b/src/views/elements/SearchElement.php
@@ -332,8 +332,11 @@ class SearchElement extends Element implements CrawlConstants
                             number_format($page[self::DOC_RANK], 2)) . "\n");
                         e(tl('search_element_relevancy',
                             number_format($page[self::RELEVANCE], 2) ) . "\n");
-                        e(tl('search_element_proximity',
-                            number_format($page[self::PROXIMITY], 2) ) . "\n");
+                        if ($page[self::PROXIMITY] > 0) {
+                            e(tl('search_element_proximity',
+                                number_format($page[self::PROXIMITY], 2)
+                                ) . "\n");
+                        }
                         if (isset($page[self::USER_RANKS])) {
                             foreach ($page[self::USER_RANKS] as
                                 $label => $score) {
diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php
index d47c770b7..d06fa31ad 100644
--- a/tests/IndexDocumentBundleTest.php
+++ b/tests/IndexDocumentBundleTest.php
@@ -231,11 +231,12 @@ use seekquarry\yioop\library\UnitTest;
         foreach ($keys as $key) {
             $row = $doc_map_tools->find($doc_map, $key);
             $entry = $doc_map_tools->unpack($row);
-            $title_length = str_word_count(UrlParser::getWordsInHostUrl(
+            $preface_length = str_word_count(UrlParser::getWordsInHostUrl(
                     $docs[$i][CC::SUMMARY][CC::URL]) . " " .
-                    $docs[$i][CC::SUMMARY][CC::TITLE]) + 2;
-            $this->assertEqual($title_length, $entry[1]["POS"],
-                "Doc $i title length matches calculated.");
+                    $docs[$i][CC::SUMMARY][CC::TITLE]) + 3;
+            $entry_preface_length = ($entry[1]["POS"] & 255);
+            $this->assertEqual($preface_length, $entry_preface_length,
+                "Doc $i preface length matches calculated.");
             $i++;
         }
         $posting_tools = $this->index_archive->postings_tools;
@@ -257,7 +258,7 @@ use seekquarry\yioop\library\UnitTest;
             $entry[0]['FREQUENCY']);
         $host_word_count = str_word_count(UrlParser::getWordsInHostUrl(
             $docs[1][CC::SUMMARY][CC::URL]));
-        $this->assertEqual($position_list[0], $host_word_count + 4,
+        $this->assertEqual($position_list[0], 1 + $host_word_count + 1 + 3,
             "Test Position List Decode");
     }
     /**
@@ -327,7 +328,7 @@ use seekquarry\yioop\library\UnitTest;
                 $last += $postings[$j]['POSITIONS_OFFSET'];
                 $position_list = L\decodePositionList($encoded_positions,
                     $postings[$j]['FREQUENCY']);
-                $this->assertEqual($position_list[0], 7,
+                $this->assertEqual($position_list[0], 9,
                     "Test Position List Decode");
             }
         }
ViewGit