Improves allow landing crawl support in cases where there is also a robots meta tag, a=chris

Chris Pollett [2022-07-28, ~21:00]
Improves allow landing crawl support in cases where there is also a robots meta tag, a=chris
Filename
src/executables/Fetcher.php
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index cfdf16f86..c21fdd4e9 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -500,6 +500,7 @@ class Fetcher implements CrawlConstants
             'py' => 'py'];
         //we will get the correct crawl order from a queue_server
         $this->crawl_order = self::HOST_BUDGETING;
+        $this->robots_txt = C\ALWAYS_FOLLOW_ROBOTS;
         $this->max_depth = -1;
         $this->summarizer_option = self::BASIC_SUMMARIZER;
         $this->debug = "";
@@ -1489,6 +1490,7 @@ class Fetcher implements CrawlConstants
             self::MINIMUM_FETCH_LOOP_TIME => 'minimum_fetch_loop_time',
             self::PROXY_SERVERS => 'proxy_servers',
             self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
+            self::ROBOTS_TXT => 'robots_txt',
             self::SUMMARIZER_OPTION => "summarizer_option",
             self::TOR_PROXY => 'tor_proxy'];
         $check_cull_fields = ["restrict_sites_by_url", "allowed_sites",
@@ -2037,6 +2039,13 @@ class Fetcher implements CrawlConstants
                     $site[self::ROBOT_METAS] = array_merge(
                         $site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
                 }
+                if ($this->robots_txt == C\IGNORE_ROBOTS) {
+                    $site[self::ROBOT_METAS] = [];
+                }
+                if ($this->robots_txt == C\ALLOW_LANDING_ROBOTS) {
+                    $site[self::ROBOT_METAS] = array_diff(
+                        $site[self::ROBOT_METAS], ["NOINDEX", "NONE"]);
+                }
                 //here's where we enforce NOFOLLOW
                 if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                     in_array("NONE", $site[self::ROBOT_METAS])) {
ViewGit