Improves allow-landing crawl support in cases where there is also a robots meta tag, a=chris
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index cfdf16f86..c21fdd4e9 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -500,6 +500,7 @@ class Fetcher implements CrawlConstants
'py' => 'py'];
//we will get the correct crawl order from a queue_server
$this->crawl_order = self::HOST_BUDGETING;
+ $this->robots_txt = C\ALWAYS_FOLLOW_ROBOTS;
$this->max_depth = -1;
$this->summarizer_option = self::BASIC_SUMMARIZER;
$this->debug = "";
@@ -1489,6 +1490,7 @@ class Fetcher implements CrawlConstants
self::MINIMUM_FETCH_LOOP_TIME => 'minimum_fetch_loop_time',
self::PROXY_SERVERS => 'proxy_servers',
self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
+ self::ROBOTS_TXT => 'robots_txt',
self::SUMMARIZER_OPTION => "summarizer_option",
self::TOR_PROXY => 'tor_proxy'];
$check_cull_fields = ["restrict_sites_by_url", "allowed_sites",
@@ -2037,6 +2039,13 @@ class Fetcher implements CrawlConstants
$site[self::ROBOT_METAS] = array_merge(
$site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
}
+ if ($this->robots_txt == C\IGNORE_ROBOTS) {
+ $site[self::ROBOT_METAS] = [];
+ }
+ if ($this->robots_txt == C\ALLOW_LANDING_ROBOTS) {
+ $site[self::ROBOT_METAS] = array_diff(
+ $site[self::ROBOT_METAS], ["NOINDEX", "NONE"]);
+ }
//here's where we enforce NOFOLLOW
if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
in_array("NONE", $site[self::ROBOT_METAS])) {