Increased MAX_FETCH_SIZE

Chris Pollett [2023-09-03]
Increased MAX_FETCH_SIZE
Filename
src/configs/Config.php
src/controllers/components/CrawlComponent.php
src/executables/Fetcher.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 8568ddc85..e6b48cb2d 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -1064,8 +1064,11 @@ nsconddefine('DOWNLOAD_SIZE_INTERVAL', 5000);
 nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
 /** maximum number of urls to schedule to a given fetcher in one go
  *  Fetcher needs enough memory to hold all these files in memory.
+ *  So we took FETCHER_MEMORY_LIMIT divided by 16000000 (roughly
+ *  twice the average size of an image on the web) and worked this out
+ *  as a multiple of MEMORY_PROFILE
  */
-nsconddefine('MAX_FETCH_SIZE', MEMORY_PROFILE * 195);
+nsconddefine('MAX_FETCH_SIZE', ceil(MEMORY_PROFILE * 416));
 /**
  * maximum number url queue files to process in trying to create a
  * fetch batch from a tier queue
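
The new value can be read as a back-of-the-envelope memory budget: divide the memory a fetcher is allowed to use by roughly 16 MB per download (about twice the average web image) to get the number of scheduled URLs it can safely hold at once. A minimal PHP sketch of that arithmetic follows; the memory limit used here is an illustrative assumption, not Yioop's actual FETCHER_MEMORY_LIMIT setting.

<?php
// Illustrative sketch of the sizing arithmetic behind the new MAX_FETCH_SIZE.
// The memory limit below is an assumed example value, not Yioop's real one.
$memory_profile = 1;                 // baseline MEMORY_PROFILE unit (assumed)
$fetcher_memory_limit = $memory_profile * 6656000000; // bytes a fetcher may use (assumed)
$bytes_per_download = 16000000;      // ~2x the average size of an image on the web
// Number of scheduled URLs whose downloads fit in memory at once:
$max_fetch_size = (int) ceil($fetcher_memory_limit / $bytes_per_download);
echo $max_fetch_size, "\n";          // 416 with these example numbers, i.e. MEMORY_PROFILE * 416
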
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 32f4ee554..e4729008e 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1887,7 +1887,7 @@ class CrawlComponent extends Component implements CrawlConstants
             $site_pages = FetchUrl::getPages($sites, true,
                 $data["PAGE_RANGE_REQUEST"], C\TEMP_DIR);
             $site = $site_pages[0];
-            if (!empty($site[self::HTTP_CODE]) && empty($site[self::TYPE]) &&
+            if (!empty($site[self::HTTP_CODE]) &&
                 $site[self::HTTP_CODE]>= 300 && $site[self::HTTP_CODE] <= 400) {
                 $site[self::TYPE] = "text/plain";
             }
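
The CrawlComponent change drops the empty($site[self::TYPE]) guard, so a redirect response (HTTP code 300 to 400) is now typed text/plain even when the server already reported a content type. A small standalone sketch of the new behavior, using plain string keys as stand-ins for the CrawlConstants class constants:

<?php
// Simplified stand-in for the updated check; string keys replace the
// self::HTTP_CODE / self::TYPE class constants used in CrawlComponent.
$site = [
    'HTTP_CODE' => 301,
    'TYPE' => 'text/html', // under the old guard this value would have been kept
];
if (!empty($site['HTTP_CODE']) &&
    $site['HTTP_CODE'] >= 300 && $site['HTTP_CODE'] <= 400) {
    $site['TYPE'] = "text/plain"; // redirects are always treated as plain text now
}
echo $site['TYPE'], "\n"; // text/plain
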
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 53f74e31e..f6852eb9d 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2465,7 +2465,10 @@ class Fetcher implements CrawlConstants
                     $site[self::ROBOT_PATHS][self::DISALLOWED_SITES] =
                         ["/"];
                 }
-                //set same robots.txt for all redirects went through
+                /* Set the same robots.txt for all the redirects that
+                   were followed; FetchURL::getPages always follows
+                   redirects for robots.txt files
+                 */
                 $locations = [$host];
                 if (!empty($site[self::LOCATION]) &&
                     is_array($site[self::LOCATION])) {
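
The expanded comment records why the code builds a $locations list: FetchURL::getPages follows redirects when downloading robots.txt, so the rules it returns should be recorded for every host the request passed through, not just the one originally asked for. A hedged sketch of that idea, with assumed array keys in place of the CrawlConstants constants:

<?php
// Sketch of applying one downloaded robots.txt to every redirect hop.
// Keys ('LOCATION', 'ROBOT_PATHS', 'DISALLOWED_SITES') are illustrative
// stand-ins for the self::LOCATION / self::ROBOT_PATHS constants in Fetcher.php.
$host = "http://example.com";
$site = [
    'LOCATION' => ["http://example.com", "https://www.example.com"], // assumed redirect chain
    'ROBOT_PATHS' => ['DISALLOWED_SITES' => ["/"]],
];
$locations = [$host];
if (!empty($site['LOCATION']) && is_array($site['LOCATION'])) {
    $locations = array_merge($locations, $site['LOCATION']);
}
$robot_rules = [];
foreach (array_unique($locations) as $location) {
    $robot_rules[$location] = $site['ROBOT_PATHS']; // same rules for each host passed through
}
print_r(array_keys($robot_rules)); // http://example.com, https://www.example.com
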