try to prevent malformed images from crashing fetcher, a=chris
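For pages handled by an ImageProcessor, compare the total size given in
the Content-Range header (CONTENT_SIZE) with the number of bytes actually
downloaded (SIZE). If the total is larger, the download is truncated, so
the fetcher passes an empty page to the processor instead of the partial
image data. Also renames CONTENT_LENGTH to CONTENT_SIZE, sets it while
parsing the Content-Range header rather than from curl_getinfo, and makes
parseHeaderPage take its argument by value. a=chris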
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 1d67444ea..2755368b5 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1782,7 +1782,16 @@ class Fetcher implements CrawlConstants
"HtmlProcessor")) {
$processor->scrapers = $this->scrapers;
}
- $doc_info = $processor->handle($site[self::PAGE],
+ $page = $site[self::PAGE];
+ if (L\generalIsA($page_processor, C\NS_PROCESSORS.
+ "ImageProcessor")) {
+ if (!empty($site[self::CONTENT_SIZE]) &&
+ !empty($site[self::SIZE]) && $site[self::CONTENT_SIZE] >
+ $site[self::SIZE]) {
+ $page = "";
+ }
+ }
+ $doc_info = $processor->handle($page,
$site[self::URL]);
if (C\FETCHER_PROCESS_DELAY > 0 ) {
usleep(C\FETCHER_PROCESS_DELAY);
@@ -2898,4 +2907,4 @@ class Fetcher implements CrawlConstants
* Instantiate and runs the Fetcher
*/
$fetcher = new Fetcher();
-$fetcher->start();
\ No newline at end of file
+$fetcher->start();
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index a0bb01eeb..60bab64bd 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -233,6 +233,6 @@ interface CrawlConstants
const SCRAPERS = 'dv';
const IS_NEWS = "dw";
const QUESTION_ANSWERS = 'dx';
- const CONTENT_LENGTH = 'dy';
+ const CONTENT_SIZE = 'dy';
const NO_RANGE = 'dz';
}
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index ca5339fd8..9f439912e 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -264,9 +264,6 @@ class FetchUrl implements CrawlConstants
if (!$minimal) {
$sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
CURLINFO_SIZE_DOWNLOAD);
- $site[$i][self::CONTENT_LENGTH] =
- @curl_getinfo($sites[$i][0],
- CURLINFO_CONTENT_LENGTH_DOWNLOAD);
$sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
CURLINFO_NAMELOOKUP_TIME);
$sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
@@ -409,7 +406,6 @@ class FetchUrl implements CrawlConstants
$pos = strrpos($url, "ETag:");
$url = substr_replace($url, "", $pos, strlen("ETag: ".$etag));
}
-
/* in queue_server we added the ip (if available)
after the url followed by ###
*/
@@ -499,13 +495,13 @@ class FetchUrl implements CrawlConstants
* and the web page returned. Parses out useful information from
* the header and return an array of these two parts and the useful info.
*
- * @param string& $header_and_page reference to string of downloaded data
+ * @param string $header_and_page string of downloaded data
* @param string $value field to store the page portion of page
* @return array info array consisting of a header, page for an http
* response, as well as parsed from the header the server, server
* version, operating system, encoding, and date information.
*/
- public static function parseHeaderPage(&$header_and_page,
+ public static function parseHeaderPage($header_and_page,
$value=CrawlConstants::PAGE)
{
$cache_page_validators = [];
@@ -601,6 +597,15 @@ class FetchUrl implements CrawlConstants
trim($robot_meta));
}
}
+ if (stristr($line, 'Content-Range:')) {
+ $line_parts = explode("/", $line);
+ if (!empty($line_parts[1])) {
+ $content_size = intval(trim($line_parts[1]));
+ if ($content_size > 0) {
+ $site[CrawlConstants::CONTENT_SIZE] = $content_size;
+ }
+ }
+ }
$canonical_regex = "/Link\:\s*\<\s*(http.*)\s*\>\s*\;\s*".
"rel\s*\=\s*(\"|')?canonical(\"|')?/";
// levenshtein gives notices on strings longer than 255
@@ -663,7 +668,6 @@ class FetchUrl implements CrawlConstants
} else {
$site[CrawlConstants::ENCODING] = $encoding_info;
}
-
if (!isset($site[CrawlConstants::SERVER]) ) {
$site[CrawlConstants::SERVER] = "unknown";
}
@@ -792,4 +796,4 @@ class FetchUrl implements CrawlConstants
crawlLog(wordwrap($response));
}
}
-}
\ No newline at end of file
+}