Fix pattern to locate ip in fetch_url, a=chris

Chris Pollett [2013-04-12]
Fix pattern to locate ip in fetch_url, a=chris
Filename
bin/fetcher.php
configs/config.php
lib/fetch_url.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 9edceda04..514d48f50 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1362,7 +1362,6 @@ class Fetcher implements CrawlConstants
             $response_code = $site[self::HTTP_CODE];
             $was_error = false;
             if($response_code < 200 || $response_code >= 300) {
-                $was_error = true;
                 crawlLog($site[self::URL]." response code $response_code");
                 $host = UrlParser::getHost($site[self::URL]);
                 if(!isset($this->hosts_with_errors[$host])) {
@@ -1371,6 +1370,7 @@ class Fetcher implements CrawlConstants
                 if($response_code >= 400 || $response_code < 100) {
                     // < 100 will capture failures to connect which are returned
                     // as strings
+                    $was_error = true;
                     $this->hosts_with_errors[$host]++;
                 }
                 /* we print out errors to std output. We still go ahead and
diff --git a/configs/config.php b/configs/config.php
index 401a0d5d4..1fe02262e 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -257,7 +257,7 @@ define('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
 /**
  * Code to determine how much memory current machine has
  */
-$memory = 4000000000; //assume have at least 4GB on a Mac nowadays
+$memory = 4000000000; //assume have at least 4GB on a Mac (could use vm_stat)
 if(strstr(PHP_OS, "WIN")) {
     exec('wmic memorychip get capacity', $memory_array);
     $memory = array_sum($memory_array);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 99d9f91ec..6aa33807c 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -101,6 +101,8 @@ class FetchUrl implements CrawlConstants
                     curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
                 }
                 curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
+                curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
+                    CURL_IPRESOLVE_V4);
                 curl_setopt($sites[$i][0], CURLOPT_URL, $url);
                 if(strcmp(substr($url,-10), "robots.txt") == 0 ) {
                     $follow = true; /*wikipedia redirects their robot page. grr
@@ -467,7 +469,7 @@ class FetchUrl implements CrawlConstants
      */
     static function getCurlIp($header)
     {
-        if (preg_match_all('/Trying\s+(.*)\b/',
+        if (preg_match_all('/Trying\s+(.*)(\b|\.\.\.)/',
             $header, $matches)) {
             $out_addresses = array();
             $addresses = array_unique($matches[1]);
ViewGit