Fix pattern to locate ip in fetch_url, a=chris
Also force IPv4 resolution on curl handles and set $was_error only for response codes >= 400 or < 100, a=chris
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 9edceda04..514d48f50 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1362,7 +1362,6 @@ class Fetcher implements CrawlConstants
$response_code = $site[self::HTTP_CODE];
$was_error = false;
if($response_code < 200 || $response_code >= 300) {
- $was_error = true;
crawlLog($site[self::URL]." response code $response_code");
$host = UrlParser::getHost($site[self::URL]);
if(!isset($this->hosts_with_errors[$host])) {
@@ -1371,6 +1370,7 @@ class Fetcher implements CrawlConstants
if($response_code >= 400 || $response_code < 100) {
// < 100 will capture failures to connect which are returned
// as strings
+ $was_error = true;
$this->hosts_with_errors[$host]++;
}
/* we print out errors to std output. We still go ahead and
diff --git a/configs/config.php b/configs/config.php
index 401a0d5d4..1fe02262e 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -257,7 +257,7 @@ define('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
/**
* Code to determine how much memory current machine has
*/
-$memory = 4000000000; //assume have at least 4GB on a Mac nowadays
+$memory = 4000000000; //assume have at least 4GB on a Mac (could use vm_stat)
if(strstr(PHP_OS, "WIN")) {
exec('wmic memorychip get capacity', $memory_array);
$memory = array_sum($memory_array);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 99d9f91ec..6aa33807c 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -101,6 +101,8 @@ class FetchUrl implements CrawlConstants
curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
}
curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
+ curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
+ CURL_IPRESOLVE_V4);
curl_setopt($sites[$i][0], CURLOPT_URL, $url);
if(strcmp(substr($url,-10), "robots.txt") == 0 ) {
$follow = true; /*wikipedia redirects their robot page. grr
@@ -467,7 +469,7 @@ class FetchUrl implements CrawlConstants
*/
static function getCurlIp($header)
{
- if (preg_match_all('/Trying\s+(.*)\b/',
+ if (preg_match_all('/Trying\s+(.*)(\b|\.\.\.)/',
$header, $matches)) {
$out_addresses = array();
$addresses = array_unique($matches[1]);