First pass at Issue 78, a=chris

Chris Pollett [2012-03-05]
Files changed:
bin/fetcher.php
bin/queue_server.php
configs/config.php
controllers/statistics_controller.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 53d05928b..117782879 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -288,7 +288,13 @@ class Fetcher implements CrawlConstants
      * Which fetcher instance we are (if fetcher run as a job and more than one)
      * @var string
      */
-     var $fetcher_num;
+    var $fetcher_num;
+
+    /**
+     * An array to keep track of hosts which have had a lot of http errors
+     * @var array
+     */
+    var $hosts_with_errors;

     /**
      * Sets up the field variables so that crawling can begin
@@ -310,6 +316,7 @@ class Fetcher implements CrawlConstants
         $this->current_server = 0;
         $this->page_processors = $page_processors;
         $this->meta_words = array();
+        $this->hosts_with_errors = array();

         $this->web_archive = NULL;
         $this->crawl_time = NULL;
@@ -729,7 +736,7 @@ class Fetcher implements CrawlConstants
      *
      * @return mixed array or bool. If we are doing
      *      a web crawl and we still have pages to crawl then true, if the
-     *      schedulaer page fails to download then false, otherwise, returns
+     *      scheduler page fails to download then false, otherwise, returns
      *      an array of info from the scheduler.
      */
     function checkScheduler()
@@ -751,6 +758,9 @@ class Fetcher implements CrawlConstants
         $this->recrawl_check_scheduler = false;
         $queue_server = $this->queue_servers[$this->current_server];

+        // per-host error counts are cleared with each new schedule
+        $this->hosts_with_errors = array();
+
         $start_time = microtime();
         $time = time();
         $session = md5($time . AUTH_KEY);
@@ -892,22 +902,27 @@ class Fetcher implements CrawlConstants

             $delete_indices[] = $site_pair['key'];
             if($site_pair['value'][0] != self::DUMMY) {
-                $seeds[$i][self::URL] = $site_pair['value'][0];
-                $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
-                $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
-                /*
-                  Crawl delay is only used in scheduling on the queue_server
-                  on the fetcher, we only use crawl-delay to determine
-                  if we will give a page a second try if it doesn't
-                  download the first time
-                */
-
-                if(UrlParser::getDocumentFilename($seeds[$i][self::URL]).
-                    ".".UrlParser::getDocumentType($seeds[$i][self::URL])
-                    == "robots.txt") {
-                    $seeds[$i][self::ROBOT_PATHS] = array();
+                $host = UrlParser::getHost($site_pair['value'][0]);
+                // only download if host doesn't seem congested
+                if(!isset($this->hosts_with_errors[$host]) ||
+                    $this->hosts_with_errors[$host] < DOWNLOAD_ERROR_THRESHOLD){
+                    $seeds[$i][self::URL] = $site_pair['value'][0];
+                    $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
+                    $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
+                    /*
+                      Crawl delay is only used in scheduling on the
+                      queue_server; on the fetcher, we only use crawl-delay
+                      to determine whether to give a page a second try if
+                      it doesn't download the first time.
+                    */
+
+                    if(UrlParser::getDocumentFilename($seeds[$i][self::URL]).
+                        ".".UrlParser::getDocumentType($seeds[$i][self::URL])
+                        == "robots.txt") {
+                        $seeds[$i][self::ROBOT_PATHS] = array();
+                    }
+                    $i++;
                 }
-                $i++;
             } else {
                 break;
             }
@@ -1015,7 +1030,11 @@ class Fetcher implements CrawlConstants

             if($response_code < 200 || $response_code >= 300) {
                 crawlLog($site[self::URL]." response code $response_code");
-
+                $host = UrlParser::getHost($site[self::URL]);
+                if(!isset($this->hosts_with_errors[$host])) {
+                    $this->hosts_with_errors[$host] = 0;
+                }
+                $this->hosts_with_errors[$host]++;
                 /* we print out errors to std output. We still go ahead and
                    process the page. Maybe it is a cool error page, also
                    this makes sure we don't crawl it again
@@ -1344,8 +1363,8 @@ class Fetcher implements CrawlConstants

         for($i = 0; $i < count($sites); $i++) {
             $site = $sites[$i];
+            $host = UrlParser::getHost($site[self::URL]);
             if(isset($site[self::ROBOT_PATHS])) {
-                $host = UrlParser::getHost($site[self::URL]);
                 $this->found_sites[self::ROBOT_TXT][$host][self::IP_ADDRESSES] =
                     $site[self::IP_ADDRESSES];
                 $this->found_sites[self::ROBOT_TXT][$host][self::PATHS] =
@@ -1378,6 +1397,13 @@ class Fetcher implements CrawlConstants
                 }
             } //end else

+            if(isset($this->hosts_with_errors[$host]) &&
+                $this->hosts_with_errors[$host] > DOWNLOAD_ERROR_THRESHOLD) {
+                $this->found_sites[self::ROBOT_TXT][$host][
+                    self::CRAWL_DELAY] = ERROR_CRAWL_DELAY;
+                echo "setting crawl delay $host";
+            }
+
             if(isset($this->found_sites[self::TO_CRAWL])) {
                 $this->found_sites[self::TO_CRAWL] =
                     array_filter($this->found_sites[self::TO_CRAWL]);
@@ -1510,7 +1536,7 @@ class Fetcher implements CrawlConstants
     function getCompanyLevelDomain($url)
     {
         $subdomains = UrlParser::getHostSubdomains($url);
-        if(!isset($subdomains[0])) return "";
+        if(!isset($subdomains[0]) || !isset($subdomains[2])) return "";
         /*
             if $url is www.yahoo.com
                 $subdomains[0] == com, $subdomains[1] == .com,
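
Taken together, the fetcher changes above add a simple per-host back-off: each non-2xx
response increments a counter for that host, hosts over DOWNLOAD_ERROR_THRESHOLD have
their remaining urls skipped for the rest of the schedule and are reported back with
ERROR_CRAWL_DELAY, and the counters reset when a new schedule arrives. The standalone
sketch below illustrates only that bookkeeping pattern; the CongestionTracker class and
the hostOf() helper are hypothetical, and only the two constants mirror this commit.

<?php
// Standalone sketch of the per-host back-off bookkeeping; class and helper
// names are hypothetical, the two constants mirror configs/config.php.
define('DOWNLOAD_ERROR_THRESHOLD', 10);
define('ERROR_CRAWL_DELAY', 20);

class CongestionTracker
{
    /** per-host count of HTTP error responses in the current schedule */
    var $hosts_with_errors = array();

    /** record a download whose response code fell outside the 2xx range */
    function recordError($url)
    {
        $host = $this->hostOf($url);
        if(!isset($this->hosts_with_errors[$host])) {
            $this->hosts_with_errors[$host] = 0;
        }
        $this->hosts_with_errors[$host]++;
    }

    /** true if further urls from this host should be skipped */
    function isCongested($url)
    {
        $host = $this->hostOf($url);
        return isset($this->hosts_with_errors[$host]) &&
            $this->hosts_with_errors[$host] >= DOWNLOAD_ERROR_THRESHOLD;
    }

    /** crawl delay to report back for this host, 0 if none is needed */
    function crawlDelay($url)
    {
        return $this->isCongested($url) ? ERROR_CRAWL_DELAY : 0;
    }

    /** counts do not persist across schedules; call when a new one arrives */
    function reset()
    {
        $this->hosts_with_errors = array();
    }

    /** stand-in for UrlParser::getHost() */
    function hostOf($url)
    {
        $parts = parse_url($url);
        return isset($parts['host']) ? $parts['host'] : "";
    }
}

// usage: after DOWNLOAD_ERROR_THRESHOLD errors example.com is skipped
$tracker = new CongestionTracker();
for($i = 0; $i < DOWNLOAD_ERROR_THRESHOLD; $i++) {
    $tracker->recordError("http://example.com/page$i");
}
var_dump($tracker->isCongested("http://example.com/another")); // bool(true)
var_dump($tracker->crawlDelay("http://example.com/another"));  // int(20)
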
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 1b4467ede..a440152c4 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1490,8 +1490,8 @@ class QueueServer implements CrawlConstants, Join
             $this->web_queue->emptyDNSCache();
         } else {
             crawlLog("... less than max age\n");
-            crawlLog("Number of Crawl-Delayed Hosts: ".count(
-                $this->waiting_hosts));
+            crawlLog("Number of Crawl-Delayed Hosts: ".floor(count(
+                $this->waiting_hosts)/2));
         }

         crawlLog("Checking for robots.txt files to process...");
@@ -1965,7 +1965,8 @@ class QueueServer implements CrawlConstants, Join
                     continue;
                 }

-                $num_waiting = count($this->waiting_hosts);
+                //each host has two entries in $this->waiting_hosts
+                $num_waiting = floor(count($this->waiting_hosts)/2);

                 if($delay > 0 ) {
                     // handle adding a url if there is a crawl delay
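
The two queue_server.php hunks only change how crawl-delayed hosts are counted for
logging: each host contributes two entries to $this->waiting_hosts, so the raw count
is halved. The snippet below is a hypothetical illustration of that counting issue,
assuming one lookup entry per host and one per wake-up time; the actual key scheme
used by Yioop's waiting_hosts structure may differ.

<?php
// Hypothetical illustration of why count($waiting_hosts) is twice the
// number of distinct crawl-delayed hosts when each host is stored under
// two keys. Key names here are made up for the example.
$waiting_hosts = array();

function markWaiting(&$waiting_hosts, $host, $wake_time)
{
    // forward entry: host => time it may be scheduled again
    $waiting_hosts[$host] = $wake_time;
    // reverse entry: time bucket => hosts waking at that time
    $waiting_hosts["time:$wake_time"][] = $host;
}

markWaiting($waiting_hosts, "example.com", 1330900000);
markWaiting($waiting_hosts, "example.org", 1330900060);

echo count($waiting_hosts) . "\n";            // 4 -- twice the host count
echo floor(count($waiting_hosts) / 2) . "\n"; // 2 -- crawl-delayed hosts
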
diff --git a/configs/config.php b/configs/config.php
index 1da2ce986..f5eb532c5 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -252,7 +252,17 @@ define('MAX_PHRASE_LEN', 2);
 define('NUM_MULTI_CURL_PAGES', 100);

 /** time in seconds before we give up on a page */
-define('PAGE_TIMEOUT', 30);
+define('PAGE_TIMEOUT', 30);
+
+/**
+ *  Number of pages with response code 400 or greater seen from a host
+ *  before we crawl-delay that host and drop its remaining urls from the
+ *  current schedule
+ */
+define('DOWNLOAD_ERROR_THRESHOLD', 10);
+
+/** Crawl-delay to set in the event that DOWNLOAD_ERROR_THRESHOLD is exceeded */
+define('ERROR_CRAWL_DELAY', 20);
+

 /** how often should we make in OPIC the sum of weights totals MAX_URLS */
 define('NORMALIZE_FREQUENCY', 10000);
@@ -332,7 +342,7 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
                             "image/svg+xml"=> "SvgProcessor"
 );

-if(defined('PHP_VERSION_ID') && PHP_VERSION_ID > 50300) {
+if(defined('PHP_VERSION_ID') && PHP_VERSION_ID >= 50300) {
     $INDEXING_PLUGINS = array("recipe");
 } else {
     $INDEXING_PLUGINS = array();
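
Besides the two new constants, config.php also changes the recipe-plugin test from
> to >=. PHP_VERSION_ID for release X.Y.Z is X*10000 + Y*100 + Z, so PHP 5.3.0 itself
yields exactly 50300 and the old strict comparison excluded it. The snippet below just
reproduces that arithmetic; versionId() is a local helper written for this example,
not a PHP or Yioop function.

<?php
// versionId() mimics how PHP builds PHP_VERSION_ID (major*10000 + minor*100
// + release); it is only a stand-in for this illustration.
function versionId($major, $minor, $release)
{
    return $major * 10000 + $minor * 100 + $release;
}

var_dump(versionId(5, 3, 0) > 50300);  // bool(false) -- old test: plugin off on 5.3.0
var_dump(versionId(5, 3, 0) >= 50300); // bool(true)  -- new test: plugin on for 5.3.0
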
diff --git a/controllers/statistics_controller.php b/controllers/statistics_controller.php
index c25293bfe..7241ba3b4 100644
--- a/controllers/statistics_controller.php
+++ b/controllers/statistics_controller.php
@@ -195,10 +195,11 @@ class StatisticsController extends Controller implements CrawlConstants
                 "freebsd", "gentoo", "linux", "netware", "solaris", "sunos",
                 "ubuntu", "unix"),
             "SERVER" => array("aolserver", "apache", "bigip", "boa", "caudium",
-                "cherokee", "gws", "httpd", "iis", "ibm_http_server",
-                "jetty", "lighttpd", "litespeed", "microsoft-iis", "nginx",
-                "resin", "sun-java-system", "thttpd", "tux",
-                "virtuoso", "webrick", "yaws", "yts", "zeus", "zope"),
+                "cherokee", "gws", "goahead-webs", "httpd", "iis",
+                "ibm_http_server", "jetty", "lighttpd", "litespeed",
+                "microsoft-iis", "nginx", "resin", "sun-java-system",
+                "thttpd", "tux", "virtuoso", "webrick", "yaws", "yts",
+                "zeus", "zope"),
             "SITE" => array(".aero", ".asia", ".biz", ".cat", ".com", ".coop",
                 ".edu", ".gov", ".info", ".int", ".jobs", ".mil", ".mobi",
                 ".museum", ".name", ".net", ".org", ".pro", ".tel", ".travel",