diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 4794ee6c7..68846421b 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -724,7 +724,6 @@ class SearchController extends Controller implements CrawlConstants
$tmp_meta_words);
}
$crawl_model->index_name = $index_name;
-
$original_query = $query;
list($query, $raw, $use_network, $use_cache_if_possible,
$guess_semantics) =
@@ -2073,4 +2072,4 @@ class SearchController extends Controller implements CrawlConstants
$script->setAttributeNS("","src", C\NAME_SERVER."/scripts/history.js");
$node->appendChild($script);
}
-}
\ No newline at end of file
+}
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 2755368b5..0c3d1e198 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -158,6 +158,12 @@ class Fetcher implements CrawlConstants
* @var array
*/
public $disallowed_sites;
+ /**
+     * Microtime stamp used as part of the key when caching the
+     * $allowed_sites and $disallowed_sites filtering data structures
+ * @var int
+ */
+ public $allow_disallow_cache_time;
/**
* Holds the parsed page rules which will be applied to document summaries
* before finally storing and indexing them
@@ -432,6 +438,7 @@ class Fetcher implements CrawlConstants
$this->restrict_sites_by_url = false;
$this->allowed_sites = [];
$this->disallowed_sites = [];
+ $this->allow_disallow_cache_time = microtime(true);
$this->page_rule_parser = null;
$this->video_sources = [];
$this->hosts_with_errors = [];
@@ -674,7 +681,6 @@ class Fetcher implements CrawlConstants
case self::WEB_CRAWL:
$downloaded_pages = $this->downloadPagesWebCrawl();
break;
-
case self::ARCHIVE_CRAWL:
if (isset($info[self::ARC_DATA])) {
$downloaded_pages = $info[self::ARC_DATA];
@@ -694,11 +700,9 @@ class Fetcher implements CrawlConstants
}
L\crawlLog("Number of summarized pages ".
count($summarized_site_pages));
-
$force_send = (isset($info[self::END_ITERATOR]) &&
$info[self::END_ITERATOR]) ? true : false;
$this->updateFoundSites($summarized_site_pages, $force_send);
-
$sleep_time = max(0, ceil($this->minimum_fetch_loop_time
- L\changeInMicrotime($start_time)));
if ($sleep_time > 0) {
@@ -707,7 +711,6 @@ class Fetcher implements CrawlConstants
sleep($sleep_time);
}
} //end while
-
L\crawlLog("Fetcher shutting down!!");
}
/**
@@ -1125,11 +1128,9 @@ class Fetcher implements CrawlConstants
L\crawlLog("Checking $queue_server for a new schedule.");
// hosts with error counts cleared with each schedule
$this->hosts_with_errors = [];
-
$start_time = microtime(true);
$time = time();
$session = md5($time . C\AUTH_KEY);
-
$request =
$queue_server."?c=fetch&a=schedule&time=$time&session=$session".
"&robot_instance=".$prefix . C\ROBOT_INSTANCE.
@@ -1145,7 +1146,6 @@ class Fetcher implements CrawlConstants
$tok = strtok($info_string, "\n");
$info = unserialize(base64_decode($tok));
$this->setCrawlParamsFromArray($info);
-
if (isset($info[self::SITES])) {
$tok = strtok("\n"); //skip meta info
$this->to_crawl = [];
@@ -1168,7 +1168,6 @@ class Fetcher implements CrawlConstants
self::fetch_crawl_info."{$this->crawl_time}.txt",
serialize($info));
}
-
L\crawlLog("Time to check Scheduler ".L\changeInMicrotime($start_time));
return $info;
}
@@ -2021,6 +2020,7 @@ class Fetcher implements CrawlConstants
}
}
}
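+        /* a fresh timestamp yields new "a"/"d" cache keys, so
+           UrlParser::urlMemberSiteArray rebuilds its cached filters from
+           the current allowed and disallowed site lists */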
+ $this->allow_disallow_cache_time = microtime(true);
for ($i = 0; $i < $count_again; $i++) {
L\crawlTimeoutLog("..still culling to crawl again urls. Examining ".
"location %s in queue of %s.", $i, $count);
@@ -2039,6 +2039,40 @@ class Fetcher implements CrawlConstants
L\crawlLog("...Removed $k cullable URLS from to crawl lists in time: ".
L\changeInMicrotime($start_time));
}
+ /**
+     * Checks if the url belongs to the list of sites that are allowed to be
+     * crawled and that its file type is crawlable
+     *
+     * @param string $url url to check
+     * @return bool whether the url is allowed to be crawled or not
+ */
+ public function allowedToCrawlSite($url)
+ {
+ $doc_type = UrlParser::getDocumentType($url);
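+        // extensions that aren't known file types map to the pseudo-type
+        // "unknown", which must be an indexed file type for the url to pass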
+ if (!in_array($doc_type, $this->all_file_types)) {
+ $doc_type = "unknown";
+ }
+ if (!in_array($doc_type, $this->indexed_file_types)) {
+ return false;
+ }
+ if ($this->restrict_sites_by_url) {
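+            /* "a" plus the cache timestamp keys the allowed-sites filter in
+               UrlParser::urlMemberSiteArray's cache */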
+ return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
+ "a" . $this->allow_disallow_cache_time);
+ }
+ return true;
+ }
+ /**
+     * Checks if the url belongs to the list of sites that aren't supposed
+     * to be crawled
+     *
+     * @param string $url url to check
+     * @return bool whether the url shouldn't be crawled
+ */
+ public function disallowedToCrawlSite($url)
+ {
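+        // "d" plus the cache timestamp keeps the disallowed-sites filter
+        // separate from the allowed-sites one in UrlParser's cache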
+ return UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
+ "d" . $this->allow_disallow_cache_time);
+ }
/**
* Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
* This method attempts to cull from the doc_info struct the
@@ -2179,21 +2213,33 @@ class Fetcher implements CrawlConstants
L\crawlLog(" Updating Found Sites Array...");
for ($i = 0; $i < count($sites); $i++) {
$site = $sites[$i];
- if (!isset($site[self::URL])) continue;
+ if (!isset($site[self::URL])) {
+ continue;
+ }
$host = UrlParser::getHost($site[self::URL]);
if (isset($site[self::ROBOT_PATHS])) {
- $this->found_sites[self::ROBOT_TXT][$host][self::IP_ADDRESSES] =
- $site[self::IP_ADDRESSES];
if ($site[self::IP_ADDRESSES] == ["0.0.0.0"]) {
- //probably couldn't find site so this will block from crawl
+                    /* probably couldn't find the site, so this will block
+                       it from being crawled
+                     */
$site[self::ROBOT_PATHS][self::DISALLOWED_SITES] =
["/"];
}
- $this->found_sites[self::ROBOT_TXT][$host][self::ROBOT_PATHS] =
- $site[self::ROBOT_PATHS];
- if (isset($site[self::CRAWL_DELAY])) {
- $this->found_sites[self::ROBOT_TXT][$host][
- self::CRAWL_DELAY] = $site[self::CRAWL_DELAY];
+                //set the same robots.txt info for all hosts that the
+                //redirects went through
+ $locations = [$host];
+ if (!empty($site[self::LOCATION])) {
+ $locations = array_merge($locations, $site[self::LOCATION]);
+ }
+ foreach ($locations as $location) {
+ $h = UrlParser::getHost($location);
+ $this->found_sites[self::ROBOT_TXT][$h][self::IP_ADDRESSES]=
+ $site[self::IP_ADDRESSES];
+ $this->found_sites[self::ROBOT_TXT][$h][self::ROBOT_PATHS] =
+ $site[self::ROBOT_PATHS];
+ if (isset($site[self::CRAWL_DELAY])) {
+ $this->found_sites[self::ROBOT_TXT][$h][
+ self::CRAWL_DELAY] = $site[self::CRAWL_DELAY];
+ }
}
if (isset($site[self::LINKS])
&& $this->crawl_type == self::WEB_CRAWL) {
@@ -2251,7 +2297,6 @@ class Fetcher implements CrawlConstants
} // end for
L\crawlLog(" Done Update Found Sites Array Time ".
L\changeInMicrotime($start_time));
-
if ($force_send || ($this->crawl_type == self::WEB_CRAWL &&
count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
(isset($this->found_sites[self::SEEN_URLS]) &&
@@ -2348,13 +2393,10 @@ class Fetcher implements CrawlConstants
$current_server = $this->current_server;
$queue_server = $this->queue_servers[$current_server];
L\crawlLog("Updating machine: ".$queue_server);
-
$prefix = $this->fetcher_num."-";
-
if (count($this->to_crawl) <= 0) {
$schedule_time = $this->schedule_time;
}
-
/*
In what follows as we generate post data we delete stuff
from $this->found_sites, to try to minimize our memory
@@ -2367,7 +2409,6 @@ class Fetcher implements CrawlConstants
'robot_instance' => $prefix . C\ROBOT_INSTANCE, 'data' => '',
'check_crawl_time' => $this->check_crawl_time,
'crawl_type' => $this->crawl_type];
-
//handle robots.txt data
if (isset($this->found_sites[self::ROBOT_TXT])) {
$data = L\webencode(
@@ -2379,13 +2420,11 @@ class Fetcher implements CrawlConstants
$byte_counts["TOTAL"] += $bytes_robot;
$byte_counts["ROBOT"] = $bytes_robot;
}
-
//handle cache validation data
if (isset($this->found_sites[self::CACHE_PAGE_VALIDATION_DATA])) {
$cache_page_validation_data = L\webencode(
gzcompress(serialize(
$this->found_sites[self::CACHE_PAGE_VALIDATION_DATA])));
-
unset($this->found_sites[self::CACHE_PAGE_VALIDATION_DATA]);
$bytes_cache_page_validation = strlen($cache_page_validation_data);
$post_data['data'] .= $cache_page_validation_data;
@@ -2402,7 +2441,6 @@ class Fetcher implements CrawlConstants
$this->found_sites[self::TO_CRAWL][$current_server];
}
unset($this->found_sites[self::TO_CRAWL][$current_server]);
-
$seen_cnt = 0;
if (isset($this->found_sites[self::SEEN_URLS]) &&
($seen_cnt = count($this->found_sites[self::SEEN_URLS])) > 0 ) {
@@ -2411,7 +2449,7 @@ class Fetcher implements CrawlConstants
$hash_seen_urls[] =
L\crawlHash($site[self::URL], true);
}
- $schedule_data[self::HASH_SEEN_URLS] = & $hash_seen_urls;
+ $schedule_data[self::HASH_SEEN_URLS] = $hash_seen_urls;
unset($hash_seen_urls);
}
if (!empty($schedule_data)) {
@@ -2796,7 +2834,6 @@ class Fetcher implements CrawlConstants
$link_rank = false;
}
}
-
$num_queue_servers = count($this->queue_servers);
if (isset($site[self::USER_RANKS]) &&
count($site[self::USER_RANKS]) > 0) {
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 296a0a6cb..cdf8e649e 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -88,8 +88,8 @@ class QueueServer implements CrawlConstants, Join
*/
public $disallowed_sites;
/**
- * Used to cache $allowed_sites and $disallowed_sites filtering data
- * structures
+     * Microtime stamp used as part of the key when caching the
+     * $allowed_sites and $disallowed_sites filtering data structures
* @var int
*/
public $allow_disallow_cache_time;
@@ -281,7 +281,6 @@ class QueueServer implements CrawlConstants, Join
$this->indexed_file_types = PageProcessor::$indexed_file_types;
$this->all_file_types = PageProcessor::$indexed_file_types;
$this->most_recent_fetcher = "No Fetcher has spoken with me";
-
//the next values will be set for real in startCrawl
$this->crawl_order = self::PAGE_IMPORTANCE;
$this->summarizer_option = self::CENTROID_SUMMARIZER;
@@ -2003,9 +2002,7 @@ class QueueServer implements CrawlConstants, Join
}
}
$this->web_queue->notifyFlush();
-
L\crawlLog(" time: ".L\changeInMicrotime($start_time));
-
L\crawlLog("C.. Scheduler: Add urls to queue");
$start_time = microtime(true);
/*
@@ -2296,7 +2293,6 @@ class QueueServer implements CrawlConstants, Join
}
$delay = $this->web_queue->getCrawlDelay($host_url);
}
-
if (!$this->withinQuota($url)) {
//we've not allowed to schedule $url till next hour
$delete_urls[$i] = $url;
@@ -2320,7 +2316,6 @@ class QueueServer implements CrawlConstants, Join
$hash_host;
$request_batches_per_delay =
ceil($delay/$time_per_request_guess);
-
if (!isset($crawl_delay_hosts[$hash_host])) {
$next_earliest_slot = $current_crawl_index;
$crawl_delay_hosts[$hash_host]= $next_earliest_slot;
@@ -2591,7 +2586,7 @@ class QueueServer implements CrawlConstants, Join
}
if ($this->restrict_sites_by_url) {
return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
- "a".$this->allow_disallow_cache_time);
+ "a" . $this->allow_disallow_cache_time);
}
return true;
}
@@ -2605,7 +2600,7 @@ class QueueServer implements CrawlConstants, Join
public function disallowedToCrawlSite($url)
{
return UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
- "d".$this->allow_disallow_cache_time);
+ "d" . $this->allow_disallow_cache_time);
}
/**
* Checks if the $url is from a site which has an hourly quota to download.
@@ -2645,4 +2640,4 @@ if (!C\nsdefined("UNIT_TEST_MODE")) {
*/
$queue_server = new QueueServer();
$queue_server->start();
-}
\ No newline at end of file
+}
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index f28da4fbf..c1836247d 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -243,10 +243,8 @@ class UrlParser
public static function getHost($url, $with_login_and_port = true)
{
$url_parts = @parse_url($url);
-
if (!isset($url_parts['scheme']) ) {return false;}
$host_url = $url_parts['scheme'].'://';
-
//handles common typo http:/yahoo.com rather than http://yahoo.com
if (!isset($url_parts['host'])) {
if (isset($url_parts['path'])) {
@@ -800,7 +798,9 @@ class UrlParser
$name, $return_rule = false)
{
static $cache = [];
- if (!is_array($site_array)) {return false;}
+ if (!is_array($site_array)) {
+ return false;
+ }
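+        // filter data built from $site_array is cached under $name; on a
+        // miss the whole cache is emptied once it exceeds 100 entries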
if (!isset($cache[$name])) {
if (count($cache) > 100) {
$cache = [];
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index ee93a6637..bdb2311d4 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -77,6 +77,10 @@ class PhraseModel extends ParallelModel
* @var string
*/
public $program_indicator;
+ /**
+     * Length of an info: hash record phrase
+ */
+ const INFO_HASH_LEN = 16;
/**
* Number of pages to cache in one go in memcache or filecache
* Size chosen based on 1MB max object size for memcache or filecache
@@ -320,7 +324,10 @@ class PhraseModel extends ParallelModel
$prs_cnt++;
}
$cache_results = false;
- $phrase = mb_strtolower($phrase);
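+        /* leave short info: phrases untouched; they look up records by an
+           exact hash, which is case sensitive */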
+ if (mb_strlen($phrase) > self::INFO_HASH_LEN ||
+ mb_substr($phrase, 0, 5) != "info:") {
+ $phrase = mb_strtolower($phrase);
+ }
if (C\nsdefined('USE_CACHE') && C\USE_CACHE &&
$save_timestamp == "" && $use_cache_if_allowed && !$network) {
$cache_results = self::$cache->get($phrase . $this->index_name);
@@ -452,7 +459,6 @@ class PhraseModel extends ParallelModel
} elseif (isset($results['PAGES'])) {
$results['TOTAL_ROWS'] = count($results['PAGES']);
}
-
if ($format) {
if (count($format_words) == 0) {
$format_words = null;
@@ -850,10 +856,13 @@ class PhraseModel extends ParallelModel
}
$phrase = $this->beginMatch($phrase, "www.", "site:www.");
$phrase = $this->beginMatch($phrase, "http:", "site:http:");
- $phrase = $this->beginMatch($phrase, "info:", "info:http://", "/",
- ["/"]);
- $phrase = $this->beginMatch($phrase, "info:", "info:http://", "",
- ["http"]);
+        // only rewrite info: queries longer than an info hash record
+        if ($len > self::INFO_HASH_LEN) {
+ $phrase = $this->beginMatch($phrase, "info:", "info:http://",
+ "/", ["/"]);
+ $phrase = $this->beginMatch($phrase, "info:", "info:http://",
+ "", ["http"]);
+ }
}
$tag = L\guessLocaleFromString($phrase);
if (isset($this->programming_language_map[$tag])) {