try to prevent malformed images from crashing fetcher, a=chris

Chris Pollett [2017-01-19]
Filename
src/executables/Fetcher.php
src/library/CrawlConstants.php
src/library/FetchUrl.php
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 1d67444ea..2755368b5 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1782,7 +1782,16 @@ class Fetcher implements CrawlConstants
                     "HtmlProcessor")) {
                     $processor->scrapers = $this->scrapers;
                 }
-                $doc_info = $processor->handle($site[self::PAGE],
+                $page = $site[self::PAGE];
+                if (L\generalIsA($page_processor, C\NS_PROCESSORS.
+                    "ImageProcessor")) {
+                    if (!empty($site[self::CONTENT_SIZE]) &&
+                        !empty($site[self::SIZE]) && $site[self::CONTENT_SIZE] >
+                        $site[self::SIZE]) {
+                        $page = "";
+                    }
+                }
+                $doc_info = $processor->handle($page,
                     $site[self::URL]);
                 if (C\FETCHER_PROCESS_DELAY > 0 ) {
                     usleep(C\FETCHER_PROCESS_DELAY);
@@ -2898,4 +2907,4 @@ class Fetcher implements CrawlConstants
  * Instantiate and runs the Fetcher
  */
 $fetcher =  new Fetcher();
-$fetcher->start();
\ No newline at end of file
+$fetcher->start();
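
The guard added above can be read as a small standalone check: if the total size the server reported for the object (taken from the Content-Range header and stored under CONTENT_SIZE) is larger than the number of bytes curl actually downloaded (SIZE), the image is truncated, so the fetcher hands the image processor an empty page instead of partial image data. The following is a simplified sketch of that idea, not the actual Fetcher code; the plain string keys and the helper name pageForImageProcessor() are assumptions made for the example.

<?php
// Simplified sketch of the truncation guard. The keys 'PAGE',
// 'CONTENT_SIZE', and 'SIZE' stand in for the CrawlConstants fields used
// in Fetcher.php; pageForImageProcessor() is a hypothetical helper, not a
// Yioop function.
function pageForImageProcessor(array $site)
{
    $page = isset($site['PAGE']) ? $site['PAGE'] : "";
    if (!empty($site['CONTENT_SIZE']) && !empty($site['SIZE']) &&
        $site['CONTENT_SIZE'] > $site['SIZE']) {
        // fewer bytes arrived than the server said the image has, so
        // return an empty page rather than let a partial image be parsed
        return "";
    }
    return $page;
}

For a partially downloaded image, e.g. $site = ['PAGE' => $bytes, 'CONTENT_SIZE' => 262144, 'SIZE' => 65536], the helper returns "" and the ImageProcessor never sees the malformed data.
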
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index a0bb01eeb..60bab64bd 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -233,6 +233,6 @@ interface CrawlConstants
     const SCRAPERS = 'dv';
     const IS_NEWS = "dw";
     const QUESTION_ANSWERS = 'dx';
-    const CONTENT_LENGTH = 'dy';
+    const CONTENT_SIZE = 'dy';
     const NO_RANGE = 'dz';
 }
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index ca5339fd8..9f439912e 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -264,9 +264,6 @@ class FetchUrl implements CrawlConstants
                 if (!$minimal) {
                     $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                         CURLINFO_SIZE_DOWNLOAD);
-                    $site[$i][self::CONTENT_LENGTH] =
-                        @curl_getinfo($sites[$i][0],
-                        CURLINFO_CONTENT_LENGTH_DOWNLOAD);
                     $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                         CURLINFO_NAMELOOKUP_TIME);
                     $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
@@ -409,7 +406,6 @@ class FetchUrl implements CrawlConstants
             $pos = strrpos($url, "ETag:");
             $url = substr_replace($url, "", $pos, strlen("ETag: ".$etag));
         }
-
         /* in queue_server we added the ip (if available)
           after the url followed by ###
          */
@@ -499,13 +495,13 @@ class FetchUrl implements CrawlConstants
      * and the web page returned. Parses out useful information from
      * the header and return an array of these two parts and the useful info.
      *
-     * @param string& $header_and_page reference to string of downloaded data
+     * @param string $header_and_page string of downloaded data
      * @param string $value field to store the page portion of page
      * @return array info array consisting of a header, page for an http
      *     response, as well as parsed from the header the server, server
      *     version, operating system, encoding, and date information.
      */
-    public static function parseHeaderPage(&$header_and_page,
+    public static function parseHeaderPage($header_and_page,
         $value=CrawlConstants::PAGE)
     {
         $cache_page_validators = [];
@@ -601,6 +597,15 @@ class FetchUrl implements CrawlConstants
                         trim($robot_meta));
                 }
             }
+            if (stristr($line, 'Content-Range:')) {
+                $line_parts = explode("/", $line);
+                if (!empty($line_parts[1])) {
+                    $content_size = intval(trim($line_parts[1]));
+                    if ($content_size > 0) {
+                        $site[CrawlConstants::CONTENT_SIZE] = $content_size;
+                    }
+                }
+            }
             $canonical_regex = "/Link\:\s*\<\s*(http.*)\s*\>\s*\;\s*".
                 "rel\s*\=\s*(\"|')?canonical(\"|')?/";
             // levenshtein gives notices on strings longer than 255
@@ -663,7 +668,6 @@ class FetchUrl implements CrawlConstants
         } else {
             $site[CrawlConstants::ENCODING] = $encoding_info;
         }
-
         if (!isset($site[CrawlConstants::SERVER]) ) {
             $site[CrawlConstants::SERVER] = "unknown";
         }
@@ -792,4 +796,4 @@ class FetchUrl implements CrawlConstants
             crawlLog(wordwrap($response));
         }
     }
-}
\ No newline at end of file
+}
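
The new Content-Range handling in parseHeaderPage() amounts to reading the total size after the "/" in a header such as "Content-Range: bytes 0-65535/262144". A rough standalone version of that parsing, assuming a single header line and using the hypothetical function name contentSizeFromHeaderLine(), might look like:

<?php
// Sketch only, not the Yioop implementation: pull the total object size
// out of one Content-Range header line, returning 0 when no usable total
// is present.
function contentSizeFromHeaderLine($line)
{
    if (stristr($line, 'Content-Range:') === false) {
        return 0; // not a Content-Range header
    }
    $parts = explode("/", $line);
    if (empty($parts[1])) {
        return 0; // header had no "/total" part
    }
    // an unknown total such as "bytes 0-999/*" yields intval("*") == 0
    return max(0, intval(trim($parts[1])));
}
echo contentSizeFromHeaderLine("Content-Range: bytes 0-65535/262144"); // 262144

A positive value here is what ends up in $site[CrawlConstants::CONTENT_SIZE] and is later compared against the downloaded SIZE in Fetcher.php.
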