Added documentation to queue_server.php, a=chris
diff --git a/bin/queue_server.php b/bin/queue_server.php
index f64e6107e..4242f468a 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1074,7 +1074,7 @@ class QueueServer implements CrawlConstants
foreach($to_crawl_sites as $triple) {
$url = & $triple[0];
$weight = $triple[1];
- $this->web_queue->addSeenUrlFilter($triple[2]);
+ $this->web_queue->addSeenUrlFilter($triple[2]); // add to seen urls filter, for dedup
unset($triple[2]); // so triple is now a pair
$host_url = UrlParser::getHost($url);
$host_with_robots = $host_url."/robots.txt";
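
[Annotation: the addSeenUrlFilter() call above records the url in the crawl's seen-urls filter so the same page is not queued twice. Below is a minimal sketch of the idea using a plain in-memory hash set; Yioop's actual filter is disk-backed (Bloom-filter style), and the class and method names here are illustrative only.]

    <?php
    // Illustrative stand-in for a seen-urls filter; the real filter in
    // Yioop is disk resident, not a PHP array.
    class SeenUrlFilter
    {
        private $seen = array(); // url hash => true

        public function add($url)
        {
            $this->seen[md5($url)] = true; // mark url as seen
        }

        public function contains($url)
        {
            return isset($this->seen[md5($url)]); // already queued/crawled?
        }
    }

    $filter = new SeenUrlFilter();
    $filter->add("http://example.com/page");
    var_dump($filter->contains("http://example.com/page")); // bool(true)
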
@@ -1111,6 +1111,10 @@ class QueueServer implements CrawlConstants
crawlLog("C..");
$start_time = microtime();
+ /*
+ adding urls to the queue involves disk-based contains and
+ adjust operations, so group the additions and do them last
+ */
$this->web_queue->addUrlsQueue($added_pairs);
}
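
[Annotation: the new comment above motivates batching: each queue insertion touches on-disk structures, so the loop accumulates (url, weight) pairs and flushes them with a single addUrlsQueue() call. A sketch of that pattern; $web_queue stands in for Yioop's web queue object, and only addUrlsQueue() is taken from the diff.]

    <?php
    // Batch-then-flush: collect (url, weight) pairs in memory and
    // write them to the on-disk queue in one grouped operation.
    function queueInBatches($web_queue, $to_crawl_sites)
    {
        $added_pairs = array();
        foreach ($to_crawl_sites as $pair) {
            list($url, $weight) = $pair;
            // ... per-url filter checks would happen here ...
            $added_pairs[] = array($url, $weight); // defer the disk write
        }
        // one grouped disk operation instead of one write per url
        $web_queue->addUrlsQueue($added_pairs);
    }
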
@@ -1251,24 +1255,36 @@ class QueueServer implements CrawlConstants
$start_time = microtime();
$fh = $this->web_queue->openUrlArchive();
+ /*
+ $delete_urls - array of items we will delete from the queue after
+ we have selected all of the items for the fetch batch
+ $sites - array of urls for the fetch batch; indices in this
+ array we'll call slots. Crawl-delayed host urls are spaced
+ a certain number of slots apart
+ */
while ($i <= $count && $fetch_size < MAX_FETCH_SIZE) {
+
+ // look in the queue for the url and its weight
$tmp = $this->web_queue->peekQueue($i, $fh);
+
list($url, $weight) = $tmp;
+
+ // on a queue lookup error, remove the entry and continue the loop
if($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) {
- $this->web_queue->to_crawl_queue->poll($i);
+ $delete_urls[$i] = $url;
crawlLog("Removing lookup error index");
$i++;
continue;
}
$host_url = UrlParser::getHost($url);
-
+
+ // if $url is a robots.txt url, see if we need to schedule it or not
if(strcmp($host_url."/robots.txt", $url) == 0) {
if($this->web_queue->containsGotRobotTxt($host_url)) {
$delete_urls[$i] = $url;
$i++;
} else {
-
$next_slot = $this->getEarliestSlot($current_crawl_index,
$sites);
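
[Annotation: on the slot terminology introduced above: $sites is a fixed-size fetch batch, and urls from a crawl-delayed host must be placed enough slots apart that a fetcher working through the batch at a roughly constant rate honors the delay. Below is a sketch of an earliest-free-slot search, assuming unused slots are marked with -1; it mirrors the idea behind getEarliestSlot(), not necessarily its exact implementation.]

    <?php
    // Find the first free slot after $start in the fetch batch.
    // Assumes unused slots hold -1 (an assumption for this sketch).
    function getEarliestSlot($start, $sites)
    {
        $num_slots = count($sites);
        for ($j = $start + 1; $j < $num_slots; $j++) {
            if ($sites[$j] === -1) {
                return $j; // free slot found
            }
        }
        return $num_slots; // batch is full past $start
    }

    $sites = array_fill(0, 8, -1);
    $sites[2] = array("http://example.com/robots.txt", 1.0, 0);
    echo getEarliestSlot(1, $sites); // prints 3, since slot 2 is taken
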
@@ -1288,13 +1304,13 @@ class QueueServer implements CrawlConstants
}
continue;
}
-
+
+ // now handle the non-robots.txt url case
$robots_okay = true;
if($this->web_queue->containsGotRobotTxt($host_url)) {
$host_paths = UrlParser::getHostPaths($url);
-
foreach($host_paths as $host_path) {
if($this->web_queue->containsDisallowedRobot($host_path)) {
$robots_okay = false;
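
[Annotation: containsDisallowedRobot() above checks each host path of $url against the Disallow rules gathered from the host's robots.txt. A minimal sketch of prefix-style matching follows; real robots.txt handling also covers Allow lines and wildcards, so the function below is a simplification with illustrative names.]

    <?php
    // Prefix-based Disallow matching; a simplification of full
    // robots.txt semantics (no Allow lines or wildcards).
    function containsDisallowedRobot($path, $disallowed_paths)
    {
        foreach ($disallowed_paths as $rule) {
            if ($rule !== "" && strpos($path, $rule) === 0) {
                return true; // $path falls under a disallowed prefix
            }
        }
        return false;
    }

    $disallowed = array("/private", "/tmp");
    var_dump(containsDisallowedRobot("/private/a.html", $disallowed)); // true
    var_dump(containsDisallowedRobot("/index.html", $disallowed)); // false
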
@@ -1389,7 +1405,7 @@ class QueueServer implements CrawlConstants
if(isset($sites) && count($sites) > 0 ) {
$dummy_slot = array(self::DUMMY, 0.0, 0);
/* dummy's are used for crawl delays of sites with longer delays
- when we don't have much else to crawl
+ when we don't have much else to crawl.
*/
$cnt = 0;
for($j = 0; $j < MAX_FETCH_SIZE; $j++) {
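
[Annotation: dummy slots keep crawl-delay spacing intact when the batch is underfull: a fetcher that reaches a DUMMY entry does nothing for that slot, so the gap between a slow host's real urls still translates into elapsed time. A sketch of the padding step, with a small MAX_FETCH_SIZE and a string standing in for self::DUMMY, both for illustration only.]

    <?php
    // Pad unfilled fetch-batch slots with dummy entries so the slot
    // spacing between a crawl-delayed host's urls is preserved.
    define("MAX_FETCH_SIZE", 8); // small value for illustration
    $dummy_slot = array("dummy", 0.0, 0); // stands in for self::DUMMY

    // two urls from a crawl-delayed host, four slots apart
    $sites = array(
        0 => array("http://slow.example.com/a", 1.0, 0),
        4 => array("http://slow.example.com/b", 0.5, 0),
    );
    for ($j = 0; $j < MAX_FETCH_SIZE; $j++) {
        if (!isset($sites[$j])) {
            $sites[$j] = $dummy_slot; // fetcher idles on this slot
        }
    }
    ksort($sites); // keep the batch in slot order
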