diff --git a/bin/fetcher.php b/bin/fetcher.php index d7146a79c..9edceda04 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1360,7 +1360,9 @@ class Fetcher implements CrawlConstants foreach($site_pages as $site) { $response_code = $site[self::HTTP_CODE]; + $was_error = false; if($response_code < 200 || $response_code >= 300) { + $was_error = true; crawlLog($site[self::URL]." response code $response_code"); $host = UrlParser::getHost($site[self::URL]); if(!isset($this->hosts_with_errors[$host])) { @@ -1376,11 +1378,14 @@ class Fetcher implements CrawlConstants this makes sure we don't crawl it again */ } - // text/robot is my made up mimetype for robots.txt files if(isset($site[self::ROBOT_PATHS])) { $site[self::GOT_ROBOT_TXT] = true; - $type = "text/robot"; + if(!$was_error) { + $type = "text/robot"; + } else { + $type = $site[self::TYPE]; + } } else { $type = $site[self::TYPE]; } @@ -1417,6 +1422,8 @@ class Fetcher implements CrawlConstants $text_data =false; } } else { + crawlLog("No page processor for mime type: ".$type); + crawlLog("Not processing: ".$site[self::URL]); continue; } if(!$handled) { @@ -1508,7 +1515,7 @@ class Fetcher implements CrawlConstants if(isset($doc_info[self::CRAWL_DELAY])) { $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY]; } - if(isset($doc_info[self::ROBOT_PATHS])) { + if(isset($doc_info[self::ROBOT_PATHS]) && !$was_error) { $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS]; } if(!isset($site[self::ROBOT_METAS])) { diff --git a/bin/queue_server.php b/bin/queue_server.php index 9b0ffc118..4e71a3c8d 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -1719,11 +1719,19 @@ class QueueServer implements CrawlConstants, Join } foreach($to_crawl_sites as $triple) { $url = & $triple[0]; - if(strlen($url) < 7) continue; // strlen("http://") + if(strlen($url) < 7) { // strlen("http://") + continue; + } + if($url[0] != 'h' && trim($url) == "localhost") { + $url = "http://localhost/"; + } $weight = $triple[1]; $this->web_queue->addSeenUrlFilter($triple[2]); //add for dedup unset($triple[2]); // so triple is now a pair $host_url = UrlParser::getHost($url); + if(strlen($host_url) < 7) { // strlen("http://") + continue; + } $host_with_robots = $host_url."/robots.txt"; $robots_in_queue = $this->web_queue->containsUrlQueue($host_with_robots); @@ -1928,7 +1936,7 @@ class QueueServer implements CrawlConstants, Join // if queue error remove entry any loop if($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) { $delete_urls[$i] = $url; - crawlLog("Removing lookup error index during produce fetch"); + crawlLog("Removing lookup error for $url during produce fetch"); $i++; continue; } diff --git a/configs/config.php b/configs/config.php index 45c9c3819..b2e841e0f 100644 --- a/configs/config.php +++ b/configs/config.php @@ -417,6 +417,7 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor", "text/rtf" => "RtfProcessor", "text/plain" => "TextProcessor", "text/csv" => "TextProcessor", + "text/x-java-source" => "TextProcessor", "text/tab-separated-values" => "TextProcessor", "image/jpeg" => "JpgProcessor", "image/gif" => "GifProcessor", diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 54820450d..8fc3b4cb9 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -1984,7 +1984,7 @@ class SearchController extends Controller implements CrawlConstants $url_encoded = urlencode($arr[3]); $link_text = $dom->createTextNode("$arr[0] $arr[1] ". "$arr[2]"); - $link = $this->baseAddress."&a=cache&". + $link = $this->baseLink()."&a=cache&". "q=$terms&arg=$url_encoded&its=$arr[4]&hist_open=true"; $link_dom = $dom->createElement("a"); $link_dom->setAttributeNS("", "href", $link); @@ -2110,4 +2110,4 @@ class SearchController extends Controller implements CrawlConstants $node->appendChild($script); } } -?> \ No newline at end of file +?> diff --git a/lib/fetch_url.php b/lib/fetch_url.php index 03f67511b..fe2e68589 100755 --- a/lib/fetch_url.php +++ b/lib/fetch_url.php @@ -249,7 +249,7 @@ class FetchUrl implements CrawlConstants $len = strlen(inet_pton($ip_address)); if($len == 4 || $len == 16) { if($len == 16) { - $ip_address= "[$ip_address]"; + $ip_address= "[$ip_address]"; } if(count($url_ip_parts) > 1) { $url = implode("###", $url_ip_parts); @@ -467,9 +467,20 @@ class FetchUrl implements CrawlConstants */ static function getCurlIp($header) { - if (preg_match_all('/Trying\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/', + if (preg_match_all('/Trying\s+(.*)\b/', $header, $matches)) { - return array_unique($matches[1]); + $out_addresses = array(); + $addresses = array_unique($matches[1]); + foreach($addresses as $address) { + $num = @inet_pton($address); + if($num !== false) { + $out_addresses[] = $address; + } + } + if($out_addresses != array()) { + return $out_addresses; + } + return false; } else { return false; } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 50647d855..e63603639 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -99,7 +99,6 @@ class HtmlProcessor extends TextProcessor $summary = parent::process($page, $url); } } - return $summary; } @@ -368,8 +367,9 @@ class HtmlProcessor extends TextProcessor if($rel == "" || !stristr($rel, "nofollow")) { $url = UrlParser::canonicalLink( $href->getAttribute('href'), $site); + $len = strlen($url); if(!UrlParser::checkRecursiveUrl($url) && - strlen($url) < MAX_URL_LENGTH) { + strlen($url) < MAX_URL_LENGTH && $len > 4) { if(isset($sites[$url])) { $sites[$url] .=" .. ". strip_tags($href->textContent); @@ -389,9 +389,10 @@ class HtmlProcessor extends TextProcessor if($i < MAX_LINKS_TO_EXTRACT) { $url = UrlParser::canonicalLink( $frame->getAttribute('src'), $site); + $len = strlen($url); if(!UrlParser::checkRecursiveUrl($url) - && strlen($url) < MAX_URL_LENGTH) { + && $len < MAX_URL_LENGTH && $len > 4) { if(isset($sites[$url]) ) { $sites[$url] .=" .. HTMLframe"; } else { @@ -415,8 +416,9 @@ class HtmlProcessor extends TextProcessor $url = UrlParser::canonicalLink( $img->getAttribute('src'), $site); + $len = strlen($url); if(!UrlParser::checkRecursiveUrl($url) - && strlen($url) < MAX_URL_LENGTH) { + && $len < MAX_URL_LENGTH && $len > 4) { if(isset($sites[$url]) ) { $sites[$url] .=" .. ".$alt; } else { diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php index 470a5387f..222078bfe 100644 --- a/lib/processors/pptx_processor.php +++ b/lib/processors/pptx_processor.php @@ -161,8 +161,9 @@ class PptxProcessor extends TextProcessor $url = UrlParser::canonicalLink( $hlink, $site); + $len = strlen($url); if(!UrlParser::checkRecursiveUrl($url) && - strlen($url) < MAX_URL_LENGTH) { + strlen($url) < MAX_URL_LENGTH && $len > 0) { if(isset($sites[$url])) { $sites[$url] .= " ".$hlink; } else { diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index e61d002c6..801d3000f 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -177,7 +177,8 @@ class TextProcessor extends PageProcessor preg_match_all($pattern, $page, $matches); $i = 0; foreach($matches[0] as $url) { - if(!isset($sites[$url]) && strlen($url) < MAX_URL_LENGTH) { + if(!isset($sites[$url]) && strlen($url) < MAX_URL_LENGTH && + strlen($url) > 4) { $sites[$url] = strip_tags($url); $i++; if($i >= MAX_LINKS_TO_EXTRACT) {break;}