Try to handle case where data upload fails better, a=chris

Chris Pollett [2013-04-12]
Try to handle case where data upload fails better, a=chris
Filename
bin/fetcher.php
configs/config.php
lib/fetch_url.php
lib/utility.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 01ce06f44..561e79576 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -2185,6 +2185,17 @@ class Fetcher implements CrawlConstants
                             crawlLog($info[self::SUMMARY]);
                         }
                         crawlLog("Trying again in 5 seconds...");
+                        if($i == 1) {
+                            /* maybe the server has limited memory
+                               and too high a post_max_size
+                             */
+                            crawlLog("Using smaller post size to see if it helps");
+                            $post_data['force_small'] = true;
+                            $this->post_max_size = 1000000;
+                            $info[self::POST_MAX_SIZE] = 1000000;
+                            /* set to a small value before trying again
+                             */
+                        }
                     } else {
                         crawlLog("Trying again in 5 seconds. You might want");
                         crawlLog("to check the queue server url and server");
@@ -2200,7 +2211,8 @@ class Fetcher implements CrawlConstants
                     crawlLog("Messages from Fetch Controller:");
                     crawlLog($info[self::LOGGING]);
                 }
-                if(isset($info[self::POST_MAX_SIZE]) &&
+                if(!isset($post_data['force_small']) &&
+                    isset($info[self::POST_MAX_SIZE]) &&
                     $this->post_max_size != $info[self::POST_MAX_SIZE]) {
                     crawlLog("post_max_size has changed was ".
                         "{$this->post_max_size}. Now is ".
diff --git a/configs/config.php b/configs/config.php
index 1fe02262e..d2ba0ac84 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -496,7 +496,7 @@ define('DOWNLOAD_TIME_INTERVAL', 0.5);
  * How many non-robot urls the fetcher successfully downloads between
  * times data is sent back to the queue server
  */
-define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 100);
+define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 95);

 /** maximum number of urls to schedule to a given fetcher in one go */
 define ('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000);
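MEMORY_PROFILE is a machine-dependent multiplier, so the change is easiest to read with a concrete value plugged in. Assuming MEMORY_PROFILE == 1 (an assumed value; the real one depends on the configured profile), the fetcher now reports back to the queue server after 95 seen urls rather than 100, presumably so each update it posts stays a little further under the post cap:

    <?php
    // Effective thresholds assuming MEMORY_PROFILE == 1 (machine dependent)
    define('MEMORY_PROFILE', 1);
    define('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 95); // was 100
    define('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000); // unchanged: 1000 urls
    ?>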
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 019b976b0..94796ec82 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -501,7 +501,7 @@ class FetchUrl implements CrawlConstants
     {
         static $agents = array();
         $MAX_SIZE = 50;
-        $host = @parse_url($site,PHP_URL_HOST);
+        $host = @parse_url($site, PHP_URL_HOST);
         if($host !== false) {
             if(count($agents) > $MAX_SIZE) {
                 array_shift($agents);
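The only change in this file is cosmetic (a space after the comma), but the surrounding context shows a pattern worth noting: a static $agents array used as a bounded per-host cache, with array_shift() evicting the oldest entry once $MAX_SIZE hosts are stored. A standalone sketch of that pattern, where cachedAgent() and the stored placeholder value are illustrative, not Yioop code:

    <?php
    // Bounded static cache in the style of the context above (sketch only)
    function cachedAgent($site)
    {
        static $agents = array();
        $MAX_SIZE = 50;
        $host = @parse_url($site, PHP_URL_HOST);
        if ($host === false || $host === null) {
            return false; // no usable host component in $site
        }
        if (!isset($agents[$host])) {
            if (count($agents) >= $MAX_SIZE) {
                array_shift($agents); // evict the oldest cached host
            }
            $agents[$host] = "agent-for-" . $host; // placeholder payload
        }
        return $agents[$host];
    }
    ?>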
diff --git a/lib/utility.php b/lib/utility.php
index 96bccb481..d35ccf9f5 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -1082,4 +1082,6 @@ function general_is_a($class_1, $class_2)
     if($class_1 == $class_2) return true;
     return (is_a($class_1, $class_2) || is_subclass_of($class_1, $class_2));
 }
+
+
 ?>
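The utility.php hunk only appends blank lines before the closing tag, but the general_is_a() helper shown in context merits a usage note: it returns true on an exact match, via is_a() (which covers object instances), or via is_subclass_of() (which also accepts class name strings). A small illustration, with the Hypothetical* class names made up for the example:

    <?php
    require_once 'lib/utility.php'; // defines general_is_a(); path assumed
    class HypotheticalBase {}
    class HypotheticalChild extends HypotheticalBase {}

    var_dump(general_is_a('HypotheticalChild', 'HypotheticalChild')); // true: equal
    var_dump(general_is_a('HypotheticalChild', 'HypotheticalBase')); // true: subclass
    var_dump(general_is_a(new HypotheticalChild(), 'HypotheticalBase')); // true: is_a
    var_dump(general_is_a('HypotheticalBase', 'HypotheticalChild')); // false
    ?>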