Chris Pollett [2013-04-10]
Add log message in case a downloaded file has an unknown mime type; fix bug in how localhost urls are processed, a=chris
Files changed:
bin/fetcher.php
bin/queue_server.php
configs/config.php
controllers/search_controller.php
lib/fetch_url.php
lib/processors/html_processor.php
lib/processors/pptx_processor.php
lib/processors/text_processor.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index d7146a79c..9edceda04 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1360,7 +1360,9 @@ class Fetcher implements CrawlConstants

         foreach($site_pages as $site) {
             $response_code = $site[self::HTTP_CODE];
+            $was_error = false;
             if($response_code < 200 || $response_code >= 300) {
+                $was_error = true;
                 crawlLog($site[self::URL]." response code $response_code");
                 $host = UrlParser::getHost($site[self::URL]);
                 if(!isset($this->hosts_with_errors[$host])) {
@@ -1376,11 +1378,14 @@ class Fetcher implements CrawlConstants
                    this makes sure we don't crawl it again
                 */
             }
-
             // text/robot is my made up mimetype for robots.txt files
             if(isset($site[self::ROBOT_PATHS])) {
                 $site[self::GOT_ROBOT_TXT] = true;
-                $type = "text/robot";
+                if(!$was_error) {
+                    $type = "text/robot";
+                } else {
+                    $type = $site[self::TYPE];
+                }
             } else {
                 $type = $site[self::TYPE];
             }
@@ -1417,6 +1422,8 @@ class Fetcher implements CrawlConstants
                     $text_data =false;
                 }
             } else {
+                crawlLog("No page processor for mime type: ".$type);
+                crawlLog("Not processing: ".$site[self::URL]);
                 continue;
             }
             if(!$handled) {
@@ -1508,7 +1515,7 @@ class Fetcher implements CrawlConstants
                 if(isset($doc_info[self::CRAWL_DELAY])) {
                     $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
                 }
-                if(isset($doc_info[self::ROBOT_PATHS])) {
+                if(isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                     $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
                 }
                 if(!isset($site[self::ROBOT_METAS])) {
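
Taken together, the fetcher hunks above do two things: a new $was_error flag keeps an HTTP error response fetched for a robots.txt URL from being typed as "text/robot" (and from having its paths trusted as robot rules), and pages whose mime type has no registered processor are now logged instead of being skipped silently. A minimal sketch of the resulting type selection, with plain string keys standing in for the fetcher's CrawlConstants:

    // Sketch only: $site keys are strings here; the real code uses
    // CrawlConstants such as self::ROBOT_PATHS, self::TYPE, self::URL.
    $was_error = ($response_code < 200 || $response_code >= 300);
    if (isset($site['ROBOT_PATHS']) && !$was_error) {
        // a successfully fetched robots.txt gets the made-up type
        $type = "text/robot";
    } else {
        // error pages (and ordinary pages) keep their reported type
        $type = $site['TYPE'];
    }
    if (!isset($PAGE_PROCESSORS[$type])) {
        // new: say why a page is dropped rather than failing silently
        crawlLog("No page processor for mime type: " . $type);
        crawlLog("Not processing: " . $site['URL']);
    }
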
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 9b0ffc118..4e71a3c8d 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1719,11 +1719,19 @@ class QueueServer implements CrawlConstants, Join
             }
             foreach($to_crawl_sites as $triple) {
                 $url = & $triple[0];
-                if(strlen($url) < 7) continue; // strlen("http://")
+                if(strlen($url) < 7) { // strlen("http://")
+                    continue;
+                }
+                if($url[0] != 'h' && trim($url) == "localhost") {
+                    $url = "http://localhost/";
+                }
                 $weight = $triple[1];
                 $this->web_queue->addSeenUrlFilter($triple[2]); //add for dedup
                 unset($triple[2]); // so triple is now a pair
                 $host_url = UrlParser::getHost($url);
+                if(strlen($host_url) < 7) { // strlen("http://")
+                    continue;
+                }
                 $host_with_robots = $host_url."/robots.txt";
                 $robots_in_queue =
                     $this->web_queue->containsUrlQueue($host_with_robots);
@@ -1928,7 +1936,7 @@ class QueueServer implements CrawlConstants, Join
             // if queue error, remove entry and continue loop
             if($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) {
                 $delete_urls[$i] = $url;
-                crawlLog("Removing lookup error index during produce fetch");
+                crawlLog("Removing lookup error for $url during produce fetch");
                 $i++;
                 continue;
             }
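
The first queue_server hunk is the localhost fix named in the commit message: a seed URL entered as a bare "localhost" is rewritten to "http://localhost/" before queueing, and any URL whose extracted host comes back shorter than "http://" is discarded so it cannot yield a malformed robots.txt lookup. A standalone sketch of that normalization, assuming UrlParser::getHost() returns the scheme://host prefix as it does in this codebase:

    // Sketch: returns the normalized URL, or false if it should be skipped.
    function normalizeQueueUrl($url)
    {
        if (strlen($url) < 7) {          // shorter than "http://"
            return false;
        }
        if ($url[0] != 'h' && trim($url) == "localhost") {
            $url = "http://localhost/";  // bare hostname entered as a seed
        }
        $host_url = UrlParser::getHost($url);
        if (strlen($host_url) < 7) {     // no usable scheme://host prefix
            return false;
        }
        return $url;
    }
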
diff --git a/configs/config.php b/configs/config.php
index 45c9c3819..b2e841e0f 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -417,6 +417,7 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
                             "text/rtf" => "RtfProcessor",
                             "text/plain" => "TextProcessor",
                             "text/csv" => "TextProcessor",
+                            "text/x-java-source" => "TextProcessor",
                             "text/tab-separated-values" => "TextProcessor",
                             "image/jpeg" => "JpgProcessor",
                             "image/gif" => "GifProcessor",
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index 54820450d..8fc3b4cb9 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -1984,7 +1984,7 @@ class SearchController extends Controller implements CrawlConstants
                     $url_encoded = urlencode($arr[3]);
                     $link_text = $dom->createTextNode("$arr[0] $arr[1] ".
                             "$arr[2]");
-                    $link = $this->baseAddress."&a=cache&".
+                    $link = $this->baseLink()."&a=cache&".
                         "q=$terms&arg=$url_encoded&its=$arr[4]&hist_open=true";
                     $link_dom = $dom->createElement("a");
                         $link_dom->setAttributeNS("", "href", $link);
@@ -2110,4 +2110,4 @@ class SearchController extends Controller implements CrawlConstants
         $node->appendChild($script);
     }
 }
-?>
\ No newline at end of file
+?>
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 03f67511b..fe2e68589 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -249,7 +249,7 @@ class FetchUrl implements CrawlConstants
                 $len = strlen(inet_pton($ip_address));
                 if($len == 4 || $len == 16) {
                     if($len == 16) {
-                        $ip_address= "[$ip_address]";
+                      $ip_address= "[$ip_address]";
                     }
                     if(count($url_ip_parts) > 1) {
                         $url = implode("###", $url_ip_parts);
@@ -467,9 +467,20 @@ class FetchUrl implements CrawlConstants
      */
     static function getCurlIp($header)
     {
-        if (preg_match_all('/Trying\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/',
+        if (preg_match_all('/Trying\s+(.*)\b/',
             $header, $matches)) {
-            return array_unique($matches[1]);
+            $out_addresses = array();
+            $addresses = array_unique($matches[1]);
+            foreach($addresses as $address) {
+                $num = @inet_pton($address);
+                if($num !== false) {
+                    $out_addresses[] = $address;
+                }
+            }
+            if($out_addresses != array()) {
+                return $out_addresses;
+            }
+            return false;
         } else {
             return false;
         }
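
The getCurlIp() rewrite is what lets the fetcher see more than dotted-quad IPv4: the regex now captures whatever follows curl's verbose "Trying " marker, and each candidate is validated with inet_pton(), which accepts both IPv4 and IPv6 textual addresses. A self-contained sketch of the same validated extraction:

    // Sketch: pull the addresses curl reports in "* Trying ..." header
    // lines, keeping only strings inet_pton() accepts (IPv4 or IPv6).
    function curlTriedAddresses($header)
    {
        if (!preg_match_all('/Trying\s+(.*)\b/', $header, $matches)) {
            return false;
        }
        $out_addresses = array();
        foreach (array_unique($matches[1]) as $address) {
            if (@inet_pton($address) !== false) { // @ mutes warnings on junk
                $out_addresses[] = $address;
            }
        }
        return ($out_addresses != array()) ? $out_addresses : false;
    }

For example, a header containing "* Trying 127.0.0.1..." yields array("127.0.0.1"), while a captured string that is not a valid address is dropped.
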
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 50647d855..e63603639 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -99,7 +99,6 @@ class HtmlProcessor extends TextProcessor
                 $summary = parent::process($page, $url);
             }
         }
-
         return $summary;

     }
@@ -368,8 +367,9 @@ class HtmlProcessor extends TextProcessor
                 if($rel == "" || !stristr($rel, "nofollow")) {
                     $url = UrlParser::canonicalLink(
                         $href->getAttribute('href'), $site);
+                    $len = strlen($url);
                     if(!UrlParser::checkRecursiveUrl($url)  &&
-                        strlen($url) < MAX_URL_LENGTH) {
+                        $len < MAX_URL_LENGTH && $len > 4) {
                         if(isset($sites[$url])) {
                             $sites[$url] .=" .. ".
                                 strip_tags($href->textContent);
@@ -389,9 +389,10 @@ class HtmlProcessor extends TextProcessor
             if($i < MAX_LINKS_TO_EXTRACT) {
                 $url = UrlParser::canonicalLink(
                     $frame->getAttribute('src'), $site);
+                $len = strlen($url);

                 if(!UrlParser::checkRecursiveUrl($url)
-                    && strlen($url) < MAX_URL_LENGTH) {
+                    && $len < MAX_URL_LENGTH && $len > 4) {
                     if(isset($sites[$url]) ) {
                         $sites[$url] .=" .. HTMLframe";
                     } else {
@@ -415,8 +416,9 @@ class HtmlProcessor extends TextProcessor

                 $url = UrlParser::canonicalLink(
                     $img->getAttribute('src'), $site);
+                $len = strlen($url);
                 if(!UrlParser::checkRecursiveUrl($url)
-                    && strlen($url) < MAX_URL_LENGTH) {
+                    && $len < MAX_URL_LENGTH && $len > 4) {
                     if(isset($sites[$url]) ) {
                         $sites[$url] .=" .. ".$alt;
                     } else {
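
All three html_processor hunks (anchors, frames, images) add the same lower bound: after canonicalization a link is kept only if its length is under MAX_URL_LENGTH and over 4 characters, which filters out degenerate results of canonicalLink() such as empty strings and bare fragments; the pptx_processor hunk below applies the same idea with a weaker > 0 bound. A hypothetical helper factoring out the repeated guard (the function name is mine; the bounds come from the diff):

    // Hypothetical helper wrapping the guard repeated for <a>, <frame>,
    // and <img> extraction in html_processor.php.
    function isExtractableUrl($url)
    {
        $len = strlen($url);
        return !UrlParser::checkRecursiveUrl($url)
            && $len < MAX_URL_LENGTH
            && $len > 4;   // rejects "" and other degenerate links
    }
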
diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
index 470a5387f..222078bfe 100644
--- a/lib/processors/pptx_processor.php
+++ b/lib/processors/pptx_processor.php
@@ -161,8 +161,9 @@ class PptxProcessor extends TextProcessor

                 $url = UrlParser::canonicalLink(
                     $hlink, $site);
+                $len = strlen($url);
                 if(!UrlParser::checkRecursiveUrl($url)  &&
-                    strlen($url) < MAX_URL_LENGTH) {
+                    $len < MAX_URL_LENGTH && $len > 0) {
                     if(isset($sites[$url])) {
                         $sites[$url] .= " ".$hlink;
                     } else {
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index e61d002c6..801d3000f 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -177,7 +177,8 @@ class TextProcessor extends PageProcessor
         preg_match_all($pattern, $page, $matches);
         $i = 0;
         foreach($matches[0] as $url) {
-            if(!isset($sites[$url]) && strlen($url) < MAX_URL_LENGTH) {
+            if(!isset($sites[$url]) && strlen($url) < MAX_URL_LENGTH &&
+                strlen($url) > 4) {
                 $sites[$url] = strip_tags($url);
                 $i++;
                 if($i >= MAX_LINKS_TO_EXTRACT) {break;}
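
text_processor applies the same minimum-length filter to links pulled out of plain text, where $pattern is the URL-matching regex defined just above this loop in the source. A compact sketch of the bounded extraction loop:

    // Sketch of the extraction loop with the new length bounds.
    preg_match_all($pattern, $page, $matches);
    $sites = array();
    $i = 0;
    foreach ($matches[0] as $url) {
        $len = strlen($url);
        if (!isset($sites[$url]) && $len < MAX_URL_LENGTH && $len > 4) {
            $sites[$url] = strip_tags($url);
            if (++$i >= MAX_LINKS_TO_EXTRACT) { break; }
        }
    }
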