Adds a new constant for an alternative user agent; also adds a check that tries to avoid treating a redirect as a top-level link, a=chris

Chris Pollett [2022-08-16]
Adds a new constant for an alternative user agent; also adds a check that tries to avoid treating a redirect as a top-level link, a=chris
Filename
src/configs/Config.php
src/library/FetchUrl.php
src/library/processors/HtmlProcessor.php
src/views/elements/SearchcalloutElement.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index eb09cada4..f11a35097 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -652,10 +652,10 @@ if (!PROFILE) {
     return;
 }
 /*+++ End machine generated code, feel free to edit the below as desired +++*/
-/** this is the User-Agent names the crawler provides
+/** This is the User-Agent name the crawler provides
  * a web-server it is crawling
  */
-if (defined("seekquarry\\yioop\\config\REDIRECTS_ON") &&
+if (defined("seekquarry\\yioop\\configs\REDIRECTS_ON") &&
     REDIRECTS_ON) {
     nsconddefine('USER_AGENT',
         'Mozilla/5.0 (compatible; '.USER_AGENT_SHORT.'; +'.NAME_SERVER.'bot)');
@@ -668,6 +668,15 @@ if (defined("seekquarry\\yioop\\config\REDIRECTS_ON") &&
         'Mozilla/5.0 (compatible; ' .
         USER_AGENT_SHORT.'; +' . NAME_SERVER . 'bot.php)');
 }
+/**
+ * This is the User-Agent name the crawler provides for hosts
+ * in the list USER_AGENT_ALTERNATIVE_HOSTS
+ */
+nsconddefine('USER_AGENT_ALTERNATIVE', "Mozilla/5.0");
+/**
+ * An array of hosts to use the alternative user agent for
+ */
+nsconddefine('USER_AGENT_ALTERNATIVE_HOSTS', []);
 /**
  * To change the Open Search Tool bar name override the following variable
  * in your LocalConfig.php file
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index b68c141da..857dd3cc5 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -159,7 +159,13 @@ class FetchUrl implements CrawlConstants
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
-            curl_setopt($sites[$i][0], CURLOPT_USERAGENT, C\USER_AGENT);
+            if (in_array(UrlParser::getHost($url),
+                C\USER_AGENT_ALTERNATIVE_HOSTS)) {
+                curl_setopt($sites[$i][0], CURLOPT_USERAGENT,
+                    C\USER_AGENT_ALTERNATIVE);
+            } else {
+                curl_setopt($sites[$i][0], CURLOPT_USERAGENT, C\USER_AGENT);
+            }
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
                 CURL_IPRESOLVE_WHATEVER);
             if (!empty($dns_resolve)) {
@@ -923,7 +929,13 @@ class FetchUrl implements CrawlConstants
         if ($not_web_setting) {
             crawlLog("  Init curl request of a single page");
         }
-        curl_setopt($agents[$host], CURLOPT_USERAGENT, C\USER_AGENT);
+        if (in_array(UrlParser::getHost($site),
+            C\USER_AGENT_ALTERNATIVE_HOSTS)) {
+            curl_setopt($agents[$host], CURLOPT_USERAGENT,
+                C\USER_AGENT_ALTERNATIVE);
+        } else {
+            curl_setopt($agents[$host], CURLOPT_USERAGENT, C\USER_AGENT);
+        }
         curl_setopt($agents[$host], CURLOPT_URL, $site);
         curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true);
         curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true);
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 55b25175f..c104f7e69 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -272,6 +272,11 @@ class HtmlProcessor extends TextProcessor
         $links_scores = [];
         $out_links = [];
         foreach ($links as $link_url => $link_text) {
+            /* Links whose text begins with "Location" are skipped so that
+               a redirect is not picked as a top-level link */
+            if (preg_match("/^Location/i", $link_text)) {
+                continue;
+            }
             $cld = UrlParser::getCompanyLevelDomain($url);
             if (stristr($link_url, $cld) === false ||
                 trim($link_url, "/") == trim($url, "/")) {
diff --git a/src/views/elements/SearchcalloutElement.php b/src/views/elements/SearchcalloutElement.php
index 574c4c731..2cc324d28 100644
--- a/src/views/elements/SearchcalloutElement.php
+++ b/src/views/elements/SearchcalloutElement.php
@@ -61,7 +61,7 @@ class SearchcalloutElement extends Element
                 $this->view->helper("iconlink")->renderButton($load_kwiki,
                     "edit", false, "float-opposite", "", false);
             }
-            ?><?=$data['SEARCH_CALLOUT'] ?>
+            ?><?=$data['SEARCH_CALLOUT'] ?? "" ?>
             </div>
             <?php
         }
ViewGit