Increased MAX_FETCH_SIZE

Also simplify the redirect content-type check in CrawlComponent and expand
the robots.txt redirect comment in Fetcher.
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 8568ddc85..e6b48cb2d 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -1064,8 +1064,11 @@ nsconddefine('DOWNLOAD_SIZE_INTERVAL', 5000);
nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
/** maximum number of urls to schedule to a given fetcher in one go
* Fetcher needs enough memory to hold all these files in memory.
+ * This value was derived by dividing FETCHER_MEMORY_LIMIT by 16000000
+ * (roughly twice the average size of an image on the web) and expressing
+ * the result as a multiple of MEMORY_PROFILE.
*/
-nsconddefine('MAX_FETCH_SIZE', MEMORY_PROFILE * 195);
+nsconddefine('MAX_FETCH_SIZE', ceil(MEMORY_PROFILE * 416));
/**
* maximum number url queue files to process in trying to create a
* fetch batch from a tier queue
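
The arithmetic behind the new 416 multiplier, written out as a minimal
standalone PHP sketch. The numbers below are placeholders chosen so the
division reproduces the factor; the real FETCHER_MEMORY_LIMIT and
MEMORY_PROFILE values defined in Config.php may differ.

    <?php
    // Placeholder values; the real constants live in Config.php and may differ.
    $memory_profile = 1;                // stand-in for MEMORY_PROFILE
    $fetcher_memory_limit = 6656000000; // stand-in for FETCHER_MEMORY_LIMIT
    $twice_avg_image_size = 16000000;   // roughly twice the average web image size
    // Number of pages a fetcher can be asked to hold in memory in one go
    $max_fetch_size = ceil($fetcher_memory_limit / $twice_avg_image_size);
    echo $max_fetch_size; // 416, i.e. ceil(MEMORY_PROFILE * 416) when MEMORY_PROFILE is 1
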
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 32f4ee554..e4729008e 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1887,7 +1887,7 @@ class CrawlComponent extends Component implements CrawlConstants
$site_pages = FetchUrl::getPages($sites, true,
$data["PAGE_RANGE_REQUEST"], C\TEMP_DIR);
$site = $site_pages[0];
- if (!empty($site[self::HTTP_CODE]) && empty($site[self::TYPE]) &&
+ if (!empty($site[self::HTTP_CODE]) &&
$site[self::HTTP_CODE]>= 300 && $site[self::HTTP_CODE] <= 400) {
$site[self::TYPE] = "text/plain";
}
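
A minimal sketch of what the relaxed check now does: any response whose HTTP
code falls in the redirect range has its type forced to text/plain, even when
a type was already set. The string keys below stand in for the CrawlConstants
class constants used in the real code.

    <?php
    // Fake response record; string keys stand in for CrawlConstants constants.
    $site = ['HTTP_CODE' => 301, 'TYPE' => 'image/png'];
    if (!empty($site['HTTP_CODE']) &&
        $site['HTTP_CODE'] >= 300 && $site['HTTP_CODE'] <= 400) {
        // Before this change the assignment was skipped when TYPE was already set
        $site['TYPE'] = "text/plain";
    }
    echo $site['TYPE']; // text/plain
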
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 53f74e31e..f6852eb9d 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2465,7 +2465,10 @@ class Fetcher implements CrawlConstants
$site[self::ROBOT_PATHS][self::DISALLOWED_SITES] =
["/"];
}
- //set same robots.txt for all redirects went through
+ /* Set the same robots.txt for all redirects that were followed;
+    FetchUrl::getPages always follows redirects for robots.txt files.
+  */
$locations = [$host];
if (!empty($site[self::LOCATION]) &&
is_array($site[self::LOCATION])) {
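
A rough sketch of the idea in the comment above: every host seen along the
robots.txt redirect chain ends up with the same robot paths. The array shape
and host extraction below are illustrative assumptions, not the Fetcher's
actual code, which uses CrawlConstants keys and Yioop's UrlParser.

    <?php
    // Illustrative only; keys and host parsing differ in the real Fetcher.
    $host = "https://example.com";
    $site = [
        'LOCATION' => ["https://www.example.com/robots.txt"],
        'ROBOT_PATHS' => ['DISALLOWED_SITES' => ["/"]]
    ];
    $locations = [$host];
    if (!empty($site['LOCATION']) && is_array($site['LOCATION'])) {
        foreach ($site['LOCATION'] as $redirect_url) {
            $locations[] = parse_url($redirect_url, PHP_URL_SCHEME) . "://" .
                parse_url($redirect_url, PHP_URL_HOST);
        }
    }
    $robot_rules = [];
    foreach (array_unique($locations) as $location_host) {
        // each host the robots.txt request redirected through gets the same rules
        $robot_rules[$location_host] = $site['ROBOT_PATHS'];
    }
    print_r(array_keys($robot_rules)); // both example.com hosts share one rule set
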