Add wiki pages for last change, add a new constant SITEMAP_TIER_PENALTY for tier in queue calc for sitemaps, a=chris

Chris Pollett [2022-07-30 02:Jul:th]
Add wiki pages for last change, add a new constant SITEMAP_TIER_PENALTY for tier in queue calc for sitemaps, a=chris
Filename
src/configs/Config.php
src/configs/PublicHelpPages.php
src/data/public_default.db
src/library/CrawlQueueBundle.php
src/models/PhraseModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index da1211135..2068808ca 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -727,8 +727,11 @@ nsconddefine('MAX_WAITING_HOSTS', 250);
  * before delete from queue new crawl-delayed urls
  */
 nsconddefine('WAITING_URL_FRACTION', 0.1);
-/** Minimum weight in priority queue before rebuild */
-nsconddefine('MIN_QUEUE_WEIGHT', 83);
+/** Sitemap urls (ending .gz, .bz, xml) are added to host budgeting queue
+ * with this penalty to their tier level. (So if tier would have been x
+ * with penalty will be x + SITEMAP_TIER_PENALTY)
+ */
+nsconddefine('SITEMAP_TIER_PENALTY', 4);
 /**  largest sized object allowed in a web archive (used to sanity check
  *  reading data out of a web archive)
  */
diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php
index 7ae6984b9..bbb0e4815 100644
--- a/src/configs/PublicHelpPages.php
+++ b/src/configs/PublicHelpPages.php
@@ -42608,8 +42608,10 @@ END_HEAD_VARSThe score used to rank a page is computed as the document rank scor
 ; '''Host Keyword Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of host name keywords is the fraction of this bonus that will be added to the relevance score.
 ; '''Title Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the title is the fraction of this bonus that will be added to the relevance score.
 ; '''Path Bonus''' :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the path portion of the url is the fraction of this bonus that will be added to the relevance score.
+; '''Proximity Bonus''' : Proximity scores for multi-term queries are normalized between 0 and 1; this is the weighting factor by which that basic score is multiplied before it is added to the overall score used for ranking.
 ; '''CLD Url Bonus''' : Bonus to add to doc rank score if the url is a company level domain.
 ; '''Host Url Bonus''' : Bonus to add to doc rank score if the url is a hostname.
+; '''User Rank Bonus''' : User rank scores (created by making a classifier) for a document are normalized between 0 and 1; this is the weighting factor by which that basic score is multiplied before it is added to the overall score used for ranking.


 EOD;
diff --git a/src/data/public_default.db b/src/data/public_default.db
index 6a6900c34..19c410e87 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index 96f87acc7..eb6ad0fff 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -387,7 +387,7 @@ class CrawlQueueBundle
         $robots_txt = "robots.txt";
         // put sitemaps in a higher queue
         if (in_array(substr($url, -3), [".gz", ".bz", "xml"])) {
-            $tier += 2;
+            $tier += C\SITEMAP_TIER_PENALTY;
         } else if (UrlParser::guessMimeTypeFromFileName($url) != "text/html"
             && substr($url, -strlen($robots_txt)) != $robots_txt) {
             //slightly penalize non html documents
@@ -596,9 +596,15 @@ class CrawlQueueBundle
             }
             return ($is_empty) ? false : $sub_dir;
         }
+        /* the hope of the following is to prevent looking at sitemaps
+           too early in the crawl before all the seed sites are downloaded
+         */
         $exp_max_folder++;
-        // the hope is this prevents looking at sitemaps too quickly
-        $max_folder = min(count($sub_dirs), ceil(log($exp_max_folder, 2)));
+        $pre_max_folder = ceil(log($exp_max_folder, 2));
+        if ($pre_max_folder >=  C\SITEMAP_TIER_PENALTY) {
+            $pre_max_folder = count($sub_dirs);
+        }
+        $max_folder = min(count($sub_dirs), $pre_max_folder);
         $last_folder = ($last_folder < $max_folder - 1) ?
             $last_folder + 1 : 0;
         return $sub_dirs[$last_folder];
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 7ae9d89ad..ad25b2824 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1276,12 +1276,11 @@ class PhraseModel extends ParallelModel
             $ranking_factors["PROXIMITY_BONUS"] ??= C\PROXIMITY_BONUS;
             $ranking_factors["USER_RANK_BONUS"] ??= C\USER_RANK_BONUS;
             for ($i = 0; $i < $result_count; $i++) {
-                $pages[$i]["OUT_SCORE"] = $pages[$i][self::SCORE];
                 $pages[$i][self::PROXIMITY] ??= 0;
                 $pages[$i][self::PROXIMITY] *=
                     $ranking_factors["PROXIMITY_BONUS"];
                 if ($use_proximity) {
-                    $pages[$i]["OUT_SCORE"] += $pages[$i][self::PROXIMITY];
+                    $pages[$i][self::SCORE] += $pages[$i][self::PROXIMITY];
                 }
                 if (isset($pages[$i][self::USER_RANKS])) {
                     $j = count($pages[$i][self::USER_RANKS]);
@@ -1299,7 +1298,7 @@ class PhraseModel extends ParallelModel
                                 floatval(number_format(
                                 $pages[$i][self::USER_RANKS][$j]['SCORE'] ?? 0,
                                 4, '.', ''));
-                            $pages[$i]["OUT_SCORE"] += $pages[$i]["USCORE$j"];
+                            $pages[$i][self::SCORE] += $pages[$i]["USCORE$j"];
                         } else {
                             // might want to print all scores
                             $pages[$i]["USCORE$j"] = 0;
@@ -1307,7 +1306,7 @@ class PhraseModel extends ParallelModel
                     }
                 }
             }
-            L\orderCallback($pages[0], $pages[0], "OUT_SCORE");
+            L\orderCallback($pages[0], $pages[0], self::SCORE);
             usort($pages, C\NS_LIB . "orderCallback");
             $sort_time = L\changeInMicrotime($sort_start);
         }
ViewGit