diff --git a/bin/fetcher.php b/bin/fetcher.php
index 53d05928b..117782879 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -288,7 +288,13 @@ class Fetcher implements CrawlConstants
* Which fetcher instance we are (if fetcher run as a job and more than one)
* @var string
*/
- var $fetcher_num;
+ var $fetcher_num;
+
+ /**
+ * An array to keep track of hosts which have had a lot of http errors
+ * @var array
+ */
+ var $hosts_with_errors;
/**
* Sets up the field variables so that crawling can begin
@@ -310,6 +316,7 @@ class Fetcher implements CrawlConstants
$this->current_server = 0;
$this->page_processors = $page_processors;
$this->meta_words = array();
+ $this->hosts_with_errors = array();
$this->web_archive = NULL;
$this->crawl_time = NULL;
@@ -729,7 +736,7 @@ class Fetcher implements CrawlConstants
*
* @return mixed array or bool. If we are doing
* a web crawl and we still have pages to crawl then true, if the
- * schedulaer page fails to download then false, otherwise, returns
+ * scheduler page fails to download then false, otherwise, returns
* an array of info from the scheduler.
*/
function checkScheduler()
@@ -751,6 +758,9 @@ class Fetcher implements CrawlConstants
$this->recrawl_check_scheduler = false;
$queue_server = $this->queue_servers[$this->current_server];
+ // per-host error counts are cleared with each new schedule
+ $this->hosts_with_errors = array();
+
$start_time = microtime();
$time = time();
$session = md5($time . AUTH_KEY);
@@ -892,22 +902,27 @@ class Fetcher implements CrawlConstants
$delete_indices[] = $site_pair['key'];
if($site_pair['value'][0] != self::DUMMY) {
- $seeds[$i][self::URL] = $site_pair['value'][0];
- $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
- $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
- /*
- Crawl delay is only used in scheduling on the queue_server
- on the fetcher, we only use crawl-delay to determine
- if we will give a page a second try if it doesn't
- download the first time
- */
-
- if(UrlParser::getDocumentFilename($seeds[$i][self::URL]).
- ".".UrlParser::getDocumentType($seeds[$i][self::URL])
- == "robots.txt") {
- $seeds[$i][self::ROBOT_PATHS] = array();
+ $host = UrlParser::getHost($site_pair['value'][0]);
+ // only download if host doesn't seem congested
+ if(!isset($this->hosts_with_errors[$host]) ||
+ $this->hosts_with_errors[$host] < DOWNLOAD_ERROR_THRESHOLD){
+ $seeds[$i][self::URL] = $site_pair['value'][0];
+ $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
+ $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
+ /*
+ Crawl delay is only used in scheduling on the queue_server.
+ On the fetcher, we only use crawl-delay to determine whether
+ to give a page a second try if it doesn't download the
+ first time.
+ */
+
+ if(UrlParser::getDocumentFilename($seeds[$i][self::URL]).
+ ".".UrlParser::getDocumentType($seeds[$i][self::URL])
+ == "robots.txt") {
+ $seeds[$i][self::ROBOT_PATHS] = array();
+ }
+ $i++;
}
- $i++;
} else {
break;
}
@@ -1015,7 +1030,11 @@ class Fetcher implements CrawlConstants
if($response_code < 200 || $response_code >= 300) {
crawlLog($site[self::URL]." response code $response_code");
-
+ $host = UrlParser::getHost($site[self::URL]);
+ if(!isset($this->hosts_with_errors[$host])) {
+ $this->hosts_with_errors[$host] = 0;
+ }
+ $this->hosts_with_errors[$host]++;
/* we print out errors to std output. We still go ahead and
process the page. Maybe it is a cool error page, also
this makes sure we don't crawl it again
@@ -1344,8 +1363,8 @@ class Fetcher implements CrawlConstants
for($i = 0; $i < count($sites); $i++) {
$site = $sites[$i];
+ $host = UrlParser::getHost($site[self::URL]);
if(isset($site[self::ROBOT_PATHS])) {
- $host = UrlParser::getHost($site[self::URL]);
$this->found_sites[self::ROBOT_TXT][$host][self::IP_ADDRESSES] =
$site[self::IP_ADDRESSES];
$this->found_sites[self::ROBOT_TXT][$host][self::PATHS] =
@@ -1378,6 +1397,13 @@ class Fetcher implements CrawlConstants
}
} //end else
+ if(isset($this->hosts_with_errors[$host]) &&
+ $this->hosts_with_errors[$host] > DOWNLOAD_ERROR_THRESHOLD) {
+ $this->found_sites[self::ROBOT_TXT][$host][
+ self::CRAWL_DELAY] = ERROR_CRAWL_DELAY;
+ crawlLog("Setting crawl delay for $host");
+ }
+
if(isset($this->found_sites[self::TO_CRAWL])) {
$this->found_sites[self::TO_CRAWL] =
array_filter($this->found_sites[self::TO_CRAWL]);
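
This is the last piece of the mechanism: once a host's error count passes the threshold, the fetcher reports an artificial crawl delay for that host along with its robots.txt data, so the queue server slows future requests to it. A minimal standalone sketch, using the constant values added to configs/config.php in this change and a hypothetical crawlDelayFor() helper:

<?php
// Standalone sketch; crawlDelayFor() is not part of the Fetcher class.
define('DOWNLOAD_ERROR_THRESHOLD', 10); // mirrors configs/config.php
define('ERROR_CRAWL_DELAY', 20);

function crawlDelayFor($host, array $hosts_with_errors)
{
    if (isset($hosts_with_errors[$host]) &&
        $hosts_with_errors[$host] > DOWNLOAD_ERROR_THRESHOLD) {
        return ERROR_CRAWL_DELAY; // seconds between requests to this host
    }
    return 0; // no extra delay requested
}

echo crawlDelayFor("busy.example.com", array("busy.example.com" => 12)), "\n"; // 20
echo crawlDelayFor("quiet.example.com", array()), "\n"; // 0
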
@@ -1510,7 +1536,7 @@ class Fetcher implements CrawlConstants
function getCompanyLevelDomain($url)
{
$subdomains = UrlParser::getHostSubdomains($url);
- if(!isset($subdomains[0])) return "";
+ if(!isset($subdomains[0]) || !isset($subdomains[2])) return "";
/*
if $url is www.yahoo.com
$subdomains[0] == com, $subdomains[1] == .com,
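
The extra isset($subdomains[2]) guard avoids reading an undefined index when a host has no dot-separated parts beyond the last label, for example a bare hostname such as localhost. A simplified standalone sketch of the idea, with getSubdomains() as a rough stand-in for UrlParser::getHostSubdomains() that ignores country-code second-level domains such as .co.uk:

<?php
// Simplified standalone sketch, not the real UrlParser code.
function getSubdomains($host)
{
    // "www.yahoo.com" -> array("com", ".com", "yahoo.com", ".yahoo.com",
    //     "www.yahoo.com", ".www.yahoo.com")
    $parts = explode(".", $host);
    $subdomains = array();
    $domain = "";
    for ($i = count($parts) - 1; $i >= 0; $i--) {
        $domain = ($domain == "") ? $parts[$i] : $parts[$i] . "." . $domain;
        $subdomains[] = $domain;
        $subdomains[] = "." . $domain;
    }
    return $subdomains;
}

function companyLevelDomain($host)
{
    $subdomains = getSubdomains($host);
    // without the second isset, a host like "localhost" would hit an
    // undefined index below
    if (!isset($subdomains[0]) || !isset($subdomains[2])) return "";
    return $subdomains[2];
}

echo companyLevelDomain("www.yahoo.com"), "\n"; // yahoo.com
echo companyLevelDomain("localhost"), "\n";     // "" (only two entries)
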
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 1b4467ede..a440152c4 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1490,8 +1490,8 @@ class QueueServer implements CrawlConstants, Join
$this->web_queue->emptyDNSCache();
} else {
crawlLog("... less than max age\n");
- crawlLog("Number of Crawl-Delayed Hosts: ".count(
- $this->waiting_hosts));
+ crawlLog("Number of Crawl-Delayed Hosts: ".floor(count(
+ $this->waiting_hosts)/2));
}
crawlLog("Checking for robots.txt files to process...");
@@ -1965,7 +1965,8 @@ class QueueServer implements CrawlConstants, Join
continue;
}
- $num_waiting = count($this->waiting_hosts);
+ //each host has two entries in $this->waiting_hosts
+ $num_waiting = floor(count($this->waiting_hosts)/2);
if($delay > 0 ) {
// handle adding a url if there is a crawl delay
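
Both queue_server.php hunks correct the same count: as the added comment notes, each crawl-delayed host contributes two entries to $this->waiting_hosts, so the true number of hosts is half of count(). A tiny illustration, assuming (purely for the sketch) one forward and one reverse entry per host:

<?php
// Illustration only: the exact shape of $waiting_hosts is an assumption here;
// the point is that two entries per host means count()/2 hosts.
$waiting_hosts = array(
    "somehost.example"  => 1334791200,          // host => scheduled time
    1334791200          => "somehost.example",  // assumed reverse entry
    "otherhost.example" => 1334791260,
    1334791260          => "otherhost.example",
);
$num_waiting = floor(count($waiting_hosts) / 2);
echo "Number of Crawl-Delayed Hosts: $num_waiting\n"; // 2
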
diff --git a/configs/config.php b/configs/config.php
index 1da2ce986..f5eb532c5 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -252,7 +252,17 @@ define('MAX_PHRASE_LEN', 2);
define('NUM_MULTI_CURL_PAGES', 100);
/** time in seconds before we give up on a page */
-define('PAGE_TIMEOUT', 30);
+define('PAGE_TIMEOUT', 30);
+
+/**
+ * Number of error pages (HTTP response code of 400 or higher) seen from a
+ * host before we crawl-delay it and dump the rest of its pages from the current schedule
+ */
+define('DOWNLOAD_ERROR_THRESHOLD', 10);
+
+/** Crawl-delay to set in the event that DOWNLOAD_ERROR_THRESHOLD is exceeded */
+define('ERROR_CRAWL_DELAY', 20);
+
/** how often, in OPIC, we should make the sum of weights total MAX_URLS */
define('NORMALIZE_FREQUENCY', 10000);
@@ -332,7 +342,7 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor",
"image/svg+xml"=> "SvgProcessor"
);
-if(defined('PHP_VERSION_ID') && PHP_VERSION_ID > 50300) {
+if(defined('PHP_VERSION_ID') && PHP_VERSION_ID >= 50300) {
$INDEXING_PLUGINS = array("recipe");
} else {
$INDEXING_PLUGINS = array();
diff --git a/controllers/statistics_controller.php b/controllers/statistics_controller.php
index c25293bfe..7241ba3b4 100644
--- a/controllers/statistics_controller.php
+++ b/controllers/statistics_controller.php
@@ -195,10 +195,11 @@ class StatisticsController extends Controller implements CrawlConstants
"freebsd", "gentoo", "linux", "netware", "solaris", "sunos",
"ubuntu", "unix"),
"SERVER" => array("aolserver", "apache", "bigip", "boa", "caudium",
- "cherokee", "gws", "httpd", "iis", "ibm_http_server",
- "jetty", "lighttpd", "litespeed", "microsoft-iis", "nginx",
- "resin", "sun-java-system", "thttpd", "tux",
- "virtuoso", "webrick", "yaws", "yts", "zeus", "zope"),
+ "cherokee", "gws", "goahead-webs", "httpd", "iis",
+ "ibm_http_server", "jetty", "lighttpd", "litespeed",
+ "microsoft-iis", "nginx", "resin", "sun-java-system",
+ "thttpd", "tux", "virtuoso", "webrick", "yaws", "yts",
+ "zeus", "zope"),
"SITE" => array(".aero", ".asia", ".biz", ".cat", ".com", ".coop",
".edu", ".gov", ".info", ".int", ".jobs", ".mil", ".mobi",
".museum", ".name", ".net", ".org", ".pro", ".tel", ".travel",