Adds a notion of fetch_weight to estimate the likely return size of a fetch batch

Chris Pollett [2023-09-04]
Filename
src/configs/Config.php
src/executables/QueueServer.php
src/library/UrlParser.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index e6b48cb2d..e583469ee 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -1069,6 +1069,15 @@ nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
  *  as multiple of MEMORY_PROFILE
  */
 nsconddefine('MAX_FETCH_SIZE', ceil(MEMORY_PROFILE * 416));
+/**
+ * A second way to ensure that not too much memory is consumed in the
+ * fetcher by downloaded web pages is to estimate, on the high side, the
+ * number of bytes each url's document is likely to consume (done in
+ * UrlParser::guessFileSizeFromUrl($url)). One can sum the estimated bytes
+ * for the urls in a fetch batch and check that the total is less than the
+ * MAX_FETCH_WEIGHT defined below. (Set to 0.25 of the Fetcher's memory.)
+ */
+nsconddefine('MAX_FETCH_WEIGHT', MEMORY_PROFILE * 166666666);
 /**
  * maximum number url queue files to process in trying to create a
  * fetch batch from a tier queue
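
As a back-of-the-envelope check on the new constant (illustrative only, not
part of the commit; the MEMORY_PROFILE value below is an assumption):

<?php
// Illustrative arithmetic for the MAX_FETCH_WEIGHT cap; a MEMORY_PROFILE
// of 1 is assumed here purely to show the scale of the constant.
$memory_profile = 1;
$max_fetch_weight = $memory_profile * 166666666;
echo $max_fetch_weight; // 166666666 bytes, about 159 MiB per profile unit
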
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 76e2c3ac0..6fa62ba23 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -2267,6 +2267,7 @@ class QueueServer implements CrawlConstants
     public function produceFetchBatch()
     {
         $fetch_size = 0;
+        $fetch_weight = 0;
         L\crawlLog("FB Scheduler: Start Produce Fetch Batch.");
         L\crawlLog("FB Crawl Time is: ". $this->crawl_time);
         L\crawlLog("FB Memory usage is " . memory_get_usage() );
@@ -2308,6 +2309,7 @@ class QueueServer implements CrawlConstants
         $examined_count = 0;
         $num_files_processed = 0;
         while ($fetch_size < C\MAX_FETCH_SIZE &&
+            $fetch_weight < C\MAX_FETCH_WEIGHT &&
             $num_files_processed < C\MAX_FILES_PROCESS_BATCH) {
             while (empty($url_tuples) && !empty($day_folders)) {
                 if (empty($url_files)) {
@@ -2378,6 +2380,7 @@ class QueueServer implements CrawlConstants
                     $sites[$next_slot] = [$url, $weight, $depth, 0, $referer];
                     $current_crawl_index++;
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else { //no more available slots so prepare to bail
                     $url_tuples[$tuple_index] = $current_tuple;
                     break;
@@ -2469,6 +2472,7 @@ class QueueServer implements CrawlConstants
                     $sites[$next_slot] = [$url, $weight, $depth,
                         $delay, $referer];
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else {
                     $reschedule_tuples[] = $current_tuple;
                     if (count($reschedule_tuples) > $max_buffer_before_write) {
@@ -2495,12 +2499,17 @@ class QueueServer implements CrawlConstants
                     $sites[$next_slot] = [$url, $weight, $depth, 0, $referer];
                     $current_crawl_index++;
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else { //no more available slots so prepare to bail
                     $url_tuples[$tuple_index] = $current_tuple;
                     break;
                 }
             } //no crawl-delay else
         } //end while
+        L\crawlLog("FB...Sheduler Final estimated fetch weight of selected ".
+            "urls $fetch_weight");
+        L\crawlLog("FB...Sheduler max allowed fetch weight is " .
+            C\MAX_FETCH_WEIGHT);
         if (!empty($url_tuples)) {
             $url_tuples = array_values($url_tuples);
             $this->crawl_queue->putUrlsFileContents($current_file_name,
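
Distilled from the hunks above: produceFetchBatch() now stops filling a batch
on whichever budget runs out first, url count (MAX_FETCH_SIZE) or estimated
bytes (MAX_FETCH_WEIGHT). A minimal self-contained sketch of that control
flow follows; the constant values, the stub estimator, and the $next_urls
list are stand-ins, not Yioop's actual queue plumbing:

<?php
// Stand-in budget constants; the real ones scale with MEMORY_PROFILE.
const MAX_FETCH_SIZE = 400;
const MAX_FETCH_WEIGHT = 166666666;
// Stub reduced from UrlParser::guessFileSizeFromUrl() to three file types.
function guessFileSizeFromUrl(string $url): int
{
    $size_map = ["pdf" => 3000000, "html" => 300000, "mp4" => 50000000];
    $extension = strtolower(pathinfo((string) parse_url($url, PHP_URL_PATH),
        PATHINFO_EXTENSION));
    return $size_map[$extension] ?? 1000000; // 1MB default, as in the commit
}
$next_urls = ["https://example.org/a.pdf", "https://example.org/b.html",
    "https://example.org/c.mp4"];
$sites = [];
$fetch_size = 0;
$fetch_weight = 0;
foreach ($next_urls as $url) {
    if ($fetch_size >= MAX_FETCH_SIZE || $fetch_weight >= MAX_FETCH_WEIGHT) {
        break; // whichever budget is exhausted first ends the batch
    }
    $sites[] = $url;
    $fetch_size++;
    $fetch_weight += guessFileSizeFromUrl($url);
}
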
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 08ef35466..e0b7e6e04 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -250,6 +250,63 @@ class UrlParser
         }
         return null;
     }
+    /**
+     * Used to guess, based on its file type, the size in bytes of the
+     * file that a url points to.
+     *
+     * @param string $url to estimate the size of
+     * @return int estimated number of bytes
+     */
+    public static function guessFileSizeFromUrl($url)
+    {
+        $size_map = [
+            "aac" => 3000000,
+            "asp" => 300000,
+            "aspx" => 300000,
+            "bmp" => 1000000,
+            "bz" => 10000000,
+            "c" => 450000,
+            "cc" => 450000,
+            "cfm" => 300000,
+            "cfml" => 300000,
+            "cgi" => 300000,
+            "cs" => 300000,
+            "css" => 300000,
+            "csv" => 3000000,
+            "doc" => 3000000,
+            "docx" => 3000000,
+            "epub" => 3000000,
+            "gif" => 1000000,
+            "gz" => 10000000,
+            "html" => 300000,
+            "ico" => 1000000,
+            "jpg" => 1000000,
+            "js" => 450000,
+            "jsp" => 300000,
+            "mp3" => 3000000,
+            "mp4" => 50000000,
+            "png" => 1000000,
+            "pdf" => 3000000,
+            "php" => 300000,
+            "pl" => 300000,
+            "ppt" => 3000000,
+            "pptx" => 3000000,
+            "py" => 300000,
+            "rss" => 300000,
+            "rtf" => 3000000,
+            "shtml" => 300000,
+            "svg" => 1000000,
+            "tab" => 3000000,
+            "tsv" => 3000000,
+            "txt" => 300000,
+            "webp" => 1000000,
+            "xlsx" => 3000000,
+            "xlsx" => 3000000,
+            "xml" => 5000000,
+        ];
+        $file_type = UrlParser::getDocumentType($url);
+        return $size_map[$file_type] ?? 1000000;
+    }
     /**
      * Get the host name portion of a url if present; if not return false
      *
@@ -598,7 +655,7 @@ class UrlParser
         $url_parts = @parse_url($url);
         if (!isset($url_parts['path'])) {
             return $default;
-        } else if ($url[strlen($url)-1] == "/" || $url[strlen($url) - 1]
+        } else if ($url[strlen($url) - 1] == "/" || $url[strlen($url) - 1]
             == "\\") {
             return $default;
         } else {
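
For reference, a usage sketch of the new estimator (assumes Yioop's
autoloader so the library namespace resolves; the urls are made up):

<?php
use seekquarry\yioop\library\UrlParser;

// Sum the high-side size estimates for a hypothetical batch of urls.
$urls = [
    "https://example.com/report.pdf", // pdf  => 3000000 in the size map
    "https://example.com/index.html", // html => 300000
    "https://example.com/clip.mp4",   // mp4  => 50000000
];
$fetch_weight = 0;
foreach ($urls as $url) {
    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
}
echo $fetch_weight; // 53300000 under this commit's size map

Extensions missing from the map fall back to the 1000000 byte default via
the null coalescing expression in guessFileSizeFromUrl().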