diff --git a/src/configs/Config.php b/src/configs/Config.php
index e6b48cb2d..e583469ee 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -1069,6 +1069,15 @@ nsconddefine('DOWNLOAD_TIME_INTERVAL', 0.5);
  * as multiple of MEMORY_PROFILE
  */
 nsconddefine('MAX_FETCH_SIZE', ceil(MEMORY_PROFILE * 416));
+/**
+ * A second way to ensure that not too much memory is consumed in the
+ * fetcher by downloaded web pages is to estimate, on the high side, the
+ * number of bytes each url's document is likely to consume. This is done in
+ * UrlParser::guessFileSizeFromUrl($url). One can sum the estimated bytes for
+ * the urls in a fetch batch and check that the total is less than
+ * MAX_FETCH_WEIGHT below. (Set to 0.25 of the Fetcher's memory size)
+ */
+nsconddefine('MAX_FETCH_WEIGHT', MEMORY_PROFILE * 166666666);
 /**
  * maximum number url queue files to process in trying to create a
  * fetch batch from a tier queue
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 76e2c3ac0..6fa62ba23 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -2267,6 +2267,7 @@ class QueueServer implements CrawlConstants
     public function produceFetchBatch()
     {
         $fetch_size = 0;
+        $fetch_weight = 0;
         L\crawlLog("FB Scheduler: Start Produce Fetch Batch.");
         L\crawlLog("FB Crawl Time is: ". $this->crawl_time);
         L\crawlLog("FB Memory usage is " . memory_get_usage() );
@@ -2308,6 +2309,7 @@
         $examined_count = 0;
         $num_files_processed = 0;
         while ($fetch_size < C\MAX_FETCH_SIZE &&
+            $fetch_weight < C\MAX_FETCH_WEIGHT &&
             $num_files_processed < C\MAX_FILES_PROCESS_BATCH) {
             while (empty($url_tuples) && !empty($day_folders)) {
                 if (empty($url_files)) {
@@ -2378,6 +2380,7 @@
                     $sites[$next_slot] = [$url, $weight, $depth, 0, $referer];
                     $current_crawl_index++;
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else { //no more available slots so prepare to bail
                     $url_tuples[$tuple_index] = $current_tuple;
                     break;
                 }
@@ -2469,6 +2472,7 @@
                     $sites[$next_slot] = [$url, $weight, $depth, $delay, $referer];
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else {
                     $reschedule_tuples[] = $current_tuple;
                     if (count($reschedule_tuples) > $max_buffer_before_write) {
@@ -2495,12 +2499,17 @@
                     $sites[$next_slot] = [$url, $weight, $depth, 0, $referer];
                     $current_crawl_index++;
                     $fetch_size++;
+                    $fetch_weight += UrlParser::guessFileSizeFromUrl($url);
                 } else { //no more available slots so prepare to bail
                     $url_tuples[$tuple_index] = $current_tuple;
                     break;
                 }
             } //no crawl-delay else
         } //end while
+        L\crawlLog("FB...Scheduler Final estimated fetch weight of selected ".
+            "urls $fetch_weight");
+        L\crawlLog("FB...Scheduler max allowed fetch weight is " .
+            C\MAX_FETCH_WEIGHT);
         if (!empty($url_tuples)) {
             $url_tuples = array_values($url_tuples);
             $this->crawl_queue->putUrlsFileContents($current_file_name,
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 08ef35466..e0b7e6e04 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -250,6 +250,63 @@ class UrlParser
         }
         return null;
     }
+    /**
+     * Used to guess the file size in bytes of the file a url points to,
+     * based on its file type.
+     *
+     * @param string $url to estimate the size of
+     * @return int estimated number of bytes
+     */
+    public static function guessFileSizeFromUrl($url)
+    {
+        $size_map = [
+            "aac" => 3000000,
+            "asp" => 300000,
+            "aspx" => 300000,
+            "bmp" => 1000000,
+            "bz" => 10000000,
+            "c" => 450000,
+            "cc" => 450000,
+            "cfm" => 300000,
+            "cfml" => 300000,
+            "cgi" => 300000,
+            "cs" => 300000,
+            "css" => 300000,
+            "csv" => 3000000,
+            "doc" => 3000000,
+            "docx" => 3000000,
+            "epub" => 3000000,
+            "gif" => 1000000,
+            "gz" => 10000000,
+            "html" => 300000,
+            "ico" => 1000000,
+            "jpg" => 1000000,
+            "js" => 450000,
+            "jsp" => 300000,
+            "mp3" => 3000000,
+            "mp4" => 50000000,
+            "png" => 1000000,
+            "pdf" => 3000000,
+            "php" => 300000,
+            "pl" => 300000,
+            "ppt" => 3000000,
+            "pptx" => 3000000,
+            "py" => 300000,
+            "rss" => 300000,
+            "rtf" => 3000000,
+            "shtml" => 300000,
+            "svg" => 1000000,
+            "tab" => 3000000,
+            "tsv" => 3000000,
+            "txt" => 300000,
+            "webp" => 1000000,
+            "xls" => 3000000,
+            "xlsx" => 3000000,
+            "xml" => 5000000,
+        ];
+        $file_type = UrlParser::getDocumentType($url);
+        return $size_map[$file_type] ?? 1000000;
+    }
     /**
      * Get the host name portion of a url if present; if not return false
      *
@@ -598,7 +655,7 @@
         $url_parts = @parse_url($url);
         if (!isset($url_parts['path'])) {
             return $default;
-        } else if ($url[strlen($url)-1] == "/" || $url[strlen($url) - 1]
+        } else if ($url[strlen($url) - 1] == "/" || $url[strlen($url) - 1]
             == "\\") {
             return $default;
         } else {
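
As a standalone illustration (not part of the patch above), the following sketch shows the size-lookup idea in isolation. It is only a sketch: the table is abbreviated, the extension parsing uses PHP's pathinfo() instead of the real UrlParser::getDocumentType(), and the helper name guess_file_size_from_url() is made up for this example.

<?php
// Minimal standalone sketch (NOT the Yioop implementation): estimate a
// download size from a url's file extension, defaulting to ~1MB when the
// extension is missing or unknown, as the patched helper also does.
function guess_file_size_from_url(string $url): int
{
    // Abbreviated version of the patch's $size_map table.
    $size_map = ["html" => 300000, "pdf" => 3000000, "mp4" => 50000000];
    $path = (string) parse_url($url, PHP_URL_PATH);
    $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));
    return $size_map[$ext] ?? 1000000;
}

echo guess_file_size_from_url("https://example.com/talk.mp4"), "\n";  // 50000000
echo guess_file_size_from_url("https://example.com/paper.pdf"), "\n"; // 3000000
echo guess_file_size_from_url("https://example.com/"), "\n";          // 1000000 (default)

The default is what keeps the batch builder usable in practice: extension-less urls (dynamic pages, directory urls) are common, and they still get a conservative ~1MB estimate rather than being skipped.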
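A second sketch, again with made-up constant values and a stub estimator standing in for UrlParser::guessFileSizeFromUrl(), shows how the new guard in produceFetchBatch() is meant to behave. Both here and in the patch, the weight cap is consulted before more urls are admitted rather than after, so a finished batch can end up somewhat above MAX_FETCH_WEIGHT.

<?php
// Illustrative stand-ins for Yioop's config constants (values invented here).
const MAX_FETCH_SIZE = 3;          // cap on number of urls per batch
const MAX_FETCH_WEIGHT = 5000000;  // cap on estimated bytes per batch (~5MB)

// Stub for UrlParser::guessFileSizeFromUrl(): pretend every url is ~3MB.
function estimated_bytes(string $url): int
{
    return 3000000;
}

$candidate_urls = [
    "https://example.com/a.pdf",
    "https://example.com/b.pdf",
    "https://example.com/c.pdf",
    "https://example.com/d.pdf",
];
$batch = [];
$fetch_size = 0;
$fetch_weight = 0;
foreach ($candidate_urls as $url) {
    // Same shape as the patched while condition: stop once either cap is hit.
    if ($fetch_size >= MAX_FETCH_SIZE || $fetch_weight >= MAX_FETCH_WEIGHT) {
        break;
    }
    $batch[] = $url;
    $fetch_size++;
    $fetch_weight += estimated_bytes($url);
}
printf("selected %d urls, estimated weight %d bytes\n", $fetch_size, $fetch_weight);
// selected 2 urls, estimated weight 6000000 bytes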