diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 4794ee6c7..68846421b 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -724,7 +724,6 @@ class SearchController extends Controller implements CrawlConstants
$tmp_meta_words);
}
$crawl_model->index_name = $index_name;
-
$original_query = $query;
list($query, $raw, $use_network, $use_cache_if_possible,
$guess_semantics) =
@@ -2073,4 +2072,4 @@ class SearchController extends Controller implements CrawlConstants
$script->setAttributeNS("","src", C\NAME_SERVER."/scripts/history.js");
$node->appendChild($script);
}
-}
\ No newline at end of file
+}
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 2755368b5..0c3d1e198 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -158,6 +158,12 @@ class Fetcher implements CrawlConstants
* @var array
*/
public $disallowed_sites;
+ /**
+     * Microtime stamp used as part of the key when caching the
+     * $allowed_sites and $disallowed_sites filtering data structures
+ * @var int
+ */
+ public $allow_disallow_cache_time;
/**
* Holds the parsed page rules which will be applied to document summaries
* before finally storing and indexing them
@@ -432,6 +438,7 @@ class Fetcher implements CrawlConstants
$this->restrict_sites_by_url = false;
$this->allowed_sites = [];
$this->disallowed_sites = [];
+ $this->allow_disallow_cache_time = microtime(true);
$this->page_rule_parser = null;
$this->video_sources = [];
$this->hosts_with_errors = [];
@@ -674,7 +681,6 @@ class Fetcher implements CrawlConstants
case self::WEB_CRAWL:
$downloaded_pages = $this->downloadPagesWebCrawl();
break;
-
case self::ARCHIVE_CRAWL:
if (isset($info[self::ARC_DATA])) {
$downloaded_pages = $info[self::ARC_DATA];
@@ -694,11 +700,9 @@ class Fetcher implements CrawlConstants
}
L\crawlLog("Number of summarized pages ".
count($summarized_site_pages));
-
$force_send = (isset($info[self::END_ITERATOR]) &&
$info[self::END_ITERATOR]) ? true : false;
$this->updateFoundSites($summarized_site_pages, $force_send);
-
$sleep_time = max(0, ceil($this->minimum_fetch_loop_time
- L\changeInMicrotime($start_time)));
if ($sleep_time > 0) {
@@ -707,7 +711,6 @@ class Fetcher implements CrawlConstants
sleep($sleep_time);
}
} //end while
-
L\crawlLog("Fetcher shutting down!!");
}
/**
@@ -1125,11 +1128,9 @@ class Fetcher implements CrawlConstants
L\crawlLog("Checking $queue_server for a new schedule.");
// hosts with error counts cleared with each schedule
$this->hosts_with_errors = [];
-
$start_time = microtime(true);
$time = time();
$session = md5($time . C\AUTH_KEY);
-
$request =
$queue_server."?c=fetch&a=schedule&time=$time&session=$session".
"&robot_instance=".$prefix . C\ROBOT_INSTANCE.
@@ -1145,7 +1146,6 @@ class Fetcher implements CrawlConstants
$tok = strtok($info_string, "\n");
$info = unserialize(base64_decode($tok));
$this->setCrawlParamsFromArray($info);
-
if (isset($info[self::SITES])) {
$tok = strtok("\n"); //skip meta info
$this->to_crawl = [];
@@ -1168,7 +1168,6 @@ class Fetcher implements CrawlConstants
self::fetch_crawl_info."{$this->crawl_time}.txt",
serialize($info));
}
-
L\crawlLog("Time to check Scheduler ".L\changeInMicrotime($start_time));
return $info;
}
@@ -2021,6 +2020,7 @@ class Fetcher implements CrawlConstants
}
}
}
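+        /* a fresh timestamp yields new "a"/"d" cache keys, so
+           UrlParser::urlMemberSiteArray rebuilds its cached filters from
+           the current allowed and disallowed site lists */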
+ $this->allow_disallow_cache_time = microtime(true);
for ($i = 0; $i < $count_again; $i++) {
L\crawlTimeoutLog("..still culling to crawl again urls. Examining ".
"location %s in queue of %s.", $i, $count);
@@ -2039,6 +2039,40 @@ class Fetcher implements CrawlConstants
L\crawlLog("...Removed $k cullable URLS from to crawl lists in time: ".
L\changeInMicrotime($start_time));
}
+ /**
+     * Checks if the url belongs to the list of sites that are allowed to be
+     * crawled and that its file type is crawlable
+     *
+     * @param string $url url to check
+     * @return bool whether the url is allowed to be crawled or not
+ */
+ public function allowedToCrawlSite($url)
+ {
+ $doc_type = UrlParser::getDocumentType($url);
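+        // extensions that aren't known file types map to the pseudo-type
+        // "unknown", which must be an indexed file type for the url to pass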
+ if (!in_array($doc_type, $this->all_file_types)) {
+ $doc_type = "unknown";
+ }
+ if (!in_array($doc_type, $this->indexed_file_types)) {
+ return false;
+ }
+ if ($this->restrict_sites_by_url) {
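+            /* "a" plus the cache timestamp keys the allowed-sites filter in
+               UrlParser::urlMemberSiteArray's cache */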
+ return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
+ "a" . $this->allow_disallow_cache_time);
+ }
+ return true;
+ }
+ /**
+     * Checks if the url belongs to the list of sites that aren't supposed
+     * to be crawled
+     *
+     * @param string $url url to check
+     * @return bool whether the url shouldn't be crawled
+ */
+ public function disallowedToCrawlSite($url)
+ {
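+        // "d" plus the cache timestamp keeps the disallowed-sites filter
+        // separate from the allowed-sites one in UrlParser's cache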
+ return UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
+ "d" . $this->allow_disallow_cache_time);
+ }
/**
* Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
* This method attempts to cull from the doc_info struct the
@@ -2179,21 +2213,33 @@ class Fetcher implements CrawlConstants
L\crawlLog(" Updating Found Sites Array...");
for ($i = 0; $i < count($sites); $i++) {
$site = $sites[$i];
- if (!isset($site[self::URL])) continue;
+ if (!isset($site[self::URL])) {
+ continue;
+ }
$host = UrlParser::getHost($site[self::URL]);
if (isset($site[self::ROBOT_PATHS])) {
- $this->found_sites[self::ROBOT_TXT][$host][self::IP_ADDRESSES] =
- $site[self::IP_ADDRESSES];
if ($site[self::IP_ADDRESSES] == ["0.0.0.0"]) {
- //probably couldn't find site so this will block from crawl
+                    /* probably couldn't find the site, so this will block
+                       it from being crawled
+                     */
$site[self::ROBOT_PATHS][self::DISALLOWED_SITES] =
["/"];
}
- $this->found_sites[self::ROBOT_TXT][$host][self::ROBOT_PATHS] =
- $site[self::ROBOT_PATHS];
- if (isset($site[self::CRAWL_DELAY])) {
- $this->found_sites[self::ROBOT_TXT][$host][
- self::CRAWL_DELAY] = $site[self::CRAWL_DELAY];
+                //set the same robots.txt info for all hosts that the
+                //redirects went through
+ $locations = [$host];
+ if (!empty($site[self::LOCATION])) {
+ $locations = array_merge($locations, $site[self::LOCATION]);
+ }
+ foreach ($locations as $location) {
+ $h = UrlParser::getHost($location);
+ $this->found_sites[self::ROBOT_TXT][$h][self::IP_ADDRESSES]=
+ $site[self::IP_ADDRESSES];
+ $this->found_sites[self::ROBOT_TXT][$h][self::ROBOT_PATHS] =
+ $site[self::ROBOT_PATHS];
+ if (isset($site[self::CRAWL_DELAY])) {
+ $this->found_sites[self::ROBOT_TXT][$h][
+ self::CRAWL_DELAY] = $site[self::CRAWL_DELAY];
+ }
}
if (isset($site[self::LINKS])
&& $this->crawl_type == self::WEB_CRAWL) {
@@ -2251,7 +2297,6 @@ class Fetcher implements CrawlConstants
} // end for
L\crawlLog(" Done Update Found Sites Array Time ".
L\changeInMicrotime($start_time));
-
if ($force_send || ($this->crawl_type == self::WEB_CRAWL &&
count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
(isset($this->found_sites[self::SEEN_URLS]) &&
@@ -2348,13 +2393,10 @@ class Fetcher implements CrawlConstants
$current_server = $this->current_server;
$queue_server = $this->queue_servers[$current_server];
L\crawlLog("Updating machine: ".$queue_server);
-
$prefix = $this->fetcher_num."-";
-
if (count($this->to_crawl) <= 0) {
$schedule_time = $this->schedule_time;
}
-
/*
In what follows as we generate post data we delete stuff
from $this->found_sites, to try to minimize our memory
@@ -2367,7 +2409,6 @@ class Fetcher implements CrawlConstants
'robot_instance' => $prefix . C\ROBOT_INSTANCE, 'data' => '',
'check_crawl_time' => $this->check_crawl_time,
'crawl_type' => $this->crawl_type];
-
//handle robots.txt data
if (isset($this->found_sites[self::ROBOT_TXT])) {
$data = L\webencode(
@@ -2379,13 +2420,11 @@ class Fetcher implements CrawlConstants
$byte_counts["TOTAL"] += $bytes_robot;
$byte_counts["ROBOT"] = $bytes_robot;
}
-
//handle cache validation data
if (isset($this->found_sites[self::CACHE_PAGE_VALIDATION_DATA])) {
$cache_page_validation_data = L\webencode(
gzcompress(serialize(
$this->found_sites[self::CACHE_PAGE_VALIDATION_DATA])));
-
unset($this->found_sites[self::CACHE_PAGE_VALIDATION_DATA]);
$bytes_cache_page_validation = strlen($cache_page_validation_data);
$post_data['data'] .= $cache_page_validation_data;
@@ -2402,7 +2441,6 @@ class Fetcher implements CrawlConstants
$this->found_sites[self::TO_CRAWL][$current_server];
}
unset($this->found_sites[self::TO_CRAWL][$current_server]);
-
$seen_cnt = 0;
if (isset($this->found_sites[self::SEEN_URLS]) &&
($seen_cnt = count($this->found_sites[self::SEEN_URLS])) > 0 ) {
@@ -2411,7 +2449,7 @@ class Fetcher implements CrawlConstants
$hash_seen_urls[] =
L\crawlHash($site[self::URL], true);
}
- $schedule_data[self::HASH_SEEN_URLS] = & $hash_seen_urls;
+ $schedule_data[self::HASH_SEEN_URLS] = $hash_seen_urls;
unset($hash_seen_urls);
}
if (!empty($schedule_data)) {
@@ -2796,7 +2834,6 @@ class Fetcher implements CrawlConstants
$link_rank = false;
}
}
-
$num_queue_servers = count($this->queue_servers);
if (isset($site[self::USER_RANKS]) &&
count($site[self::USER_RANKS]) > 0) {
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 296a0a6cb..cdf8e649e 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -88,8 +88,8 @@ class QueueServer implements CrawlConstants, Join
*/
public $disallowed_sites;
/**
- * Used to cache $allowed_sites and $disallowed_sites filtering data
- * structures
+     * Microtime stamp used as part of the key when caching the
+     * $allowed_sites and $disallowed_sites filtering data structures
* @var int
*/
public $allow_disallow_cache_time;
@@ -281,7 +281,6 @@ class QueueServer implements CrawlConstants, Join
$this->indexed_file_types = PageProcessor::$indexed_file_types;
$this->all_file_types = PageProcessor::$indexed_file_types;
$this->most_recent_fetcher = "No Fetcher has spoken with me";
-
//the next values will be set for real in startCrawl
$this->crawl_order = self::PAGE_IMPORTANCE;
$this->summarizer_option = self::CENTROID_SUMMARIZER;
@@ -2003,9 +2002,7 @@ class QueueServer implements CrawlConstants, Join
}
}
$this->web_queue->notifyFlush();
-
L\crawlLog(" time: ".L\changeInMicrotime($start_time));
-
L\crawlLog("C.. Scheduler: Add urls to queue");
$start_time = microtime(true);
/*
@@ -2296,7 +2293,6 @@ class QueueServer implements CrawlConstants, Join
}
$delay = $this->web_queue->getCrawlDelay($host_url);
}
-
if (!$this->withinQuota($url)) {
//we've not allowed to schedule $url till next hour
$delete_urls[$i] = $url;
@@ -2320,7 +2316,6 @@ class QueueServer implements CrawlConstants, Join
$hash_host;
$request_batches_per_delay =
ceil($delay/$time_per_request_guess);
-
if (!isset($crawl_delay_hosts[$hash_host])) {
$next_earliest_slot = $current_crawl_index;
$crawl_delay_hosts[$hash_host]= $next_earliest_slot;
@@ -2591,7 +2586,7 @@ class QueueServer implements CrawlConstants, Join
}
if ($this->restrict_sites_by_url) {
return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
- "a".$this->allow_disallow_cache_time);
+ "a" . $this->allow_disallow_cache_time);
}
return true;
}
@@ -2605,7 +2600,7 @@ class QueueServer implements CrawlConstants, Join
public function disallowedToCrawlSite($url)
{
return UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
- "d".$this->allow_disallow_cache_time);
+ "d" . $this->allow_disallow_cache_time);
}
/**
* Checks if the $url is from a site which has an hourly quota to download.
@@ -2645,4 +2640,4 @@ if (!C\nsdefined("UNIT_TEST_MODE")) {
*/
$queue_server = new QueueServer();
$queue_server->start();
-}
\ No newline at end of file
+}
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index f28da4fbf..c1836247d 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -243,10 +243,8 @@ class UrlParser
public static function getHost($url, $with_login_and_port = true)
{
$url_parts = @parse_url($url);
-
if (!isset($url_parts['scheme']) ) {return false;}
$host_url = $url_parts['scheme'].'://';
-
//handles common typo http:/yahoo.com rather than http://yahoo.com
if (!isset($url_parts['host'])) {
if (isset($url_parts['path'])) {
@@ -800,7 +798,9 @@ class UrlParser
$name, $return_rule = false)
{
static $cache = [];
- if (!is_array($site_array)) {return false;}
+ if (!is_array($site_array)) {
+ return false;
+ }
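+        // filter data built from $site_array is cached under $name; on a
+        // miss the whole cache is emptied once it exceeds 100 entries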
if (!isset($cache[$name])) {
if (count($cache) > 100) {
$cache = [];
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index ee93a6637..bdb2311d4 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -77,6 +77,10 @@ class PhraseModel extends ParallelModel
* @var string
*/
public $program_indicator;
+ /**
+     * Length of an info: hash record phrase
+ */
+ const INFO_HASH_LEN = 16;
/**
* Number of pages to cache in one go in memcache or filecache
* Size chosen based on 1MB max object size for memcache or filecache
@@ -320,7 +324,10 @@ class PhraseModel extends ParallelModel
$prs_cnt++;
}
$cache_results = false;
- $phrase = mb_strtolower($phrase);
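+        /* leave short info: phrases untouched; they look up records by an
+           exact hash, which is case sensitive */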
+ if (mb_strlen($phrase) > self::INFO_HASH_LEN ||
+ mb_substr($phrase, 0, 5) != "info:") {
+ $phrase = mb_strtolower($phrase);
+ }
if (C\nsdefined('USE_CACHE') && C\USE_CACHE &&
$save_timestamp == "" && $use_cache_if_allowed && !$network) {
$cache_results = self::$cache->get($phrase . $this->index_name);
@@ -452,7 +459,6 @@ class PhraseModel extends ParallelModel
} elseif (isset($results['PAGES'])) {
$results['TOTAL_ROWS'] = count($results['PAGES']);
}
-
if ($format) {
if (count($format_words) == 0) {
$format_words = null;
@@ -850,10 +856,13 @@ class PhraseModel extends ParallelModel
}
$phrase = $this->beginMatch($phrase, "www.", "site:www.");
$phrase = $this->beginMatch($phrase, "http:", "site:http:");
- $phrase = $this->beginMatch($phrase, "info:", "info:http://", "/",
- ["/"]);
- $phrase = $this->beginMatch($phrase, "info:", "info:http://", "",
- ["http"]);
+        // only rewrite info: queries longer than an info hash record
+        if ($len > self::INFO_HASH_LEN) {
+ $phrase = $this->beginMatch($phrase, "info:", "info:http://",
+ "/", ["/"]);
+ $phrase = $this->beginMatch($phrase, "info:", "info:http://",
+ "", ["http"]);
+ }
}
$tag = L\guessLocaleFromString($phrase);
if (isset($this->programming_language_map[$tag])) {