Try to handle case where data upload fails better, a=chris
Try to handle case where data upload fails better, a=chris
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 01ce06f44..561e79576 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -2185,6 +2185,17 @@ class Fetcher implements CrawlConstants
crawlLog($info[self::SUMMARY]);
}
crawlLog("Trying again in 5 seconds...");
+ if($i == 1) {
+ /* maybe server has limited memory
+ and too high a post_max_size
+ */
+ crawlLog("Using smaller post size to see if helps");
+ $post_data['force_small'] =true;
+ $this->post_max_size = 1000000;
+ $info[self::POST_MAX_SIZE] = 1000000;
+ /* set to small value before trying again.
+ */
+ }
} else {
crawlLog("Trying again in 5 seconds. You might want");
crawlLog("to check the queue server url and server");
@@ -2200,7 +2211,8 @@ class Fetcher implements CrawlConstants
crawlLog("Messages from Fetch Controller:");
crawlLog($info[self::LOGGING]);
}
- if(isset($info[self::POST_MAX_SIZE]) &&
+ if(!isset($post_data['force_small']) &&
+ isset($info[self::POST_MAX_SIZE]) &&
$this->post_max_size != $info[self::POST_MAX_SIZE]) {
crawlLog("post_max_size has changed was ".
"{$this->post_max_size}. Now is ".
diff --git a/configs/config.php b/configs/config.php
index 1fe02262e..d2ba0ac84 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -496,7 +496,7 @@ define('DOWNLOAD_TIME_INTERVAL', 0.5);
* How many non robot urls the fetcher successfully downloads before
* between times data sent back to queue server
*/
-define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 100);
+define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 95);
/** maximum number of urls to schedule to a given fetcher in one go */
define ('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 019b976b0..94796ec82 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -501,7 +501,7 @@ class FetchUrl implements CrawlConstants
{
static $agents = array();
$MAX_SIZE = 50;
- $host = @parse_url($site,PHP_URL_HOST);
+ $host = @parse_url($site, PHP_URL_HOST);
if($host !== false) {
if(count($agents) > $MAX_SIZE) {
array_shift($agents);
diff --git a/lib/utility.php b/lib/utility.php
index 96bccb481..d35ccf9f5 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -1082,4 +1082,6 @@ function general_is_a($class_1, $class_2)
if($class_1 == $class_2) return true;
return (is_a($class_1, $class_2) || is_subclass_of($class_1, $class_2));
}
+
+
?>