Makes inject urls and change active crawl option work in multi-queue_server setting, adds more docs, a=chris

Chris Pollett [2012-01-27]
Makes inject urls and change active crawl option work in multi-queue_server setting, adds more docs, a=chris
Filename
bin/queue_server.php
controllers/admin_controller.php
controllers/crawl_controller.php
lib/utility.php
models/crawl_model.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index e81d2df6f..8dc27cfb0 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1148,6 +1148,12 @@ class QueueServer implements CrawlConstants, Join
      * @param string $base_dir directory of schedules
      * @param string $callback_method what method should be called to handle
      *      a schedule
+     * @param bool $blocking this method might be called by the indexer
+     *      subcomponent when a merge tier phase is ongoing, to allow
+     *      other processing to occur. If so, we don't want an infinite
+     *      regress where the indexer calls this code, which calls the
+     *      indexer, etc. If the blocking flag is set, the indexer
+     *      subcomponent won't be called
      */
     function processDataFile($base_dir, $callback_method, $blocking = false)
     {
@@ -1204,6 +1210,12 @@ class QueueServer implements CrawlConstants, Join
      * Sets up the directory to look for a file of unprocessed
      * index archive data from fetchers then calls the function
      * processDataFile to process the oldest file found
+     * @param bool $blocking this method might be called by the indexer
+     *      subcomponent when a merge tier phase is ongoing, to allow
+     *      other processing to occur. If so, we don't want an infinite
+     *      regress where the indexer calls this code, which calls the
+     *      indexer, etc. If the blocking flag is set, the indexer
+     *      subcomponent won't be called
      */
     function processIndexData($blocking)
     {
@@ -1219,6 +1231,12 @@ class QueueServer implements CrawlConstants, Join
      *
      * @param string $file containing web page summaries and a mini-inverted
      *      index for their content
+     * @param bool $blocking this method might be called by the indexer
+     *      subcomponent when a merge tier phase is ongoing, to allow
+     *      other processing to occur. If so, we don't want an infinite
+     *      regress where the indexer calls this code, which calls the
+     *      indexer, etc. If the blocking flag is set, the indexer
+     *      subcomponent won't be called
      */
     function processIndexArchive($file, $blocking)
     {
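
To see concretely how the new $blocking flag prevents the regress the
comments describe, here is a minimal sketch (not the actual QueueServer
code) of the call cycle it breaks. The directory name and the helper
indexerSubcomponent() are hypothetical stand-ins for the real processing
steps:

<?php
/* Illustrative sketch only -- the real logic lives in bin/queue_server.php.
   Shows how passing $blocking = true stops the cycle
   indexer -> processIndexData -> indexer -> ... */
class BlockingSketch
{
    function processIndexData($blocking)
    {
        // $blocking == true means the indexer itself called us while a
        // merge tier phase was ongoing.
        $this->processDataFile("./schedules", "processIndexArchive",
            $blocking);
    }
    function processDataFile($base_dir, $callback_method, $blocking = false)
    {
        $files = glob("$base_dir/*.txt"); // oldest-file selection elided
        if(!empty($files)) {
            $this->$callback_method($files[0], $blocking);
        }
    }
    function processIndexArchive($file, $blocking)
    {
        if(!$blocking) {
            // Only hand control to the indexer subcomponent when we were
            // NOT called from inside it; this is the regress guard.
            $this->indexerSubcomponent();
        }
        // ... process $file's page summaries and mini-inverted index ...
    }
    function indexerSubcomponent() // hypothetical stand-in
    {
        // During a merge tier phase the real indexer may call
        // processIndexData(true) so other work can proceed; the true
        // flag keeps that call from recursing back into the indexer.
        $this->processIndexData(true);
    }
}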
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index b789db179..78895f8cd 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -837,7 +837,8 @@ class AdminController extends Controller implements CrawlConstants
                     $data["leftorright"] =
                         (getLocaleDirection() == 'ltr') ? "right": "left";
                     $data["ELEMENT"] = "crawloptionsElement";
-                    $crawls = $this->crawlModel->getCrawlList();
+                    $crawls = $this->crawlModel->getCrawlList(false, false,
+                        $machine_urls);
                     $indexes = $this->crawlModel->getCrawlList(true, true,
                         $machine_urls);
                     $update_flag = false;
@@ -1024,11 +1025,13 @@ class AdminController extends Controller implements CrawlConstants
                     $add_message = "";
                     if(isset($_REQUEST['ts']) &&
                         isset($_REQUEST['inject_sites'])) {
+                            $timestamp = $this->clean($_REQUEST['ts'],
+                                "string");
                             $inject_urls =
                                 $this->convertStringCleanUrlsArray(
                                 $_REQUEST['inject_sites']);
                             if($this->crawlModel->injectUrlsCurrentCrawl(
-                                $inject_urls, $machine_urls)) {
+                                $timestamp, $inject_urls, $machine_urls)) {
                                 $add_message = "<br />".
                                     tl('admin_controller_urls_injected');
                             }
@@ -1036,7 +1039,7 @@ class AdminController extends Controller implements CrawlConstants
                     if($update_flag) {
                         if(isset($_REQUEST['ts'])) {
                             $this->crawlModel->setCrawlSeedInfo($timestamp,
-                                $seed_info);
+                                $seed_info, $machine_urls);
                         } else {
                             $this->crawlModel->setSeedInfo($seed_info);
                         }
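
Put together, the inject-urls path through AdminController after this
commit reads roughly as below. This is a condensed paraphrase of the
hunks above, not a verbatim excerpt; $machine_urls is assumed to hold
the urls of the configured queue servers:

// Sanitize the request timestamp before it is used anywhere.
$timestamp = $this->clean($_REQUEST['ts'], "string");
$inject_urls = $this->convertStringCleanUrlsArray($_REQUEST['inject_sites']);
// The timestamp now travels with the urls so each queue_server can add
// them to the right crawl, not just whatever crawl is active locally.
if($this->crawlModel->injectUrlsCurrentCrawl($timestamp, $inject_urls,
    $machine_urls)) {
    $add_message = "<br />".tl('admin_controller_urls_injected');
}
// Crawl option changes are likewise broadcast to every queue_server:
$this->crawlModel->setCrawlSeedInfo($timestamp, $seed_info, $machine_urls);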
diff --git a/controllers/crawl_controller.php b/controllers/crawl_controller.php
index dd35959ed..91f8a9dec 100644
--- a/controllers/crawl_controller.php
+++ b/controllers/crawl_controller.php
@@ -73,7 +73,8 @@ class CrawlController extends Controller implements CrawlConstants
      */
     var $activities = array("sendStartCrawlMessage", "sendStopCrawlMessage",
         "crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
-        "getCrawlList", "combinedCrawlInfo", "getInfoTimestamp");
+        "getCrawlList", "combinedCrawlInfo", "getInfoTimestamp",
+        "getCrawlSeedInfo", "setCrawlSeedInfo");

     /**
      * Checks that the request seems to be coming from a legitimate fetcher then
@@ -119,6 +120,38 @@ class CrawlController extends Controller implements CrawlConstants
         echo webencode(serialize($this->crawlModel->crawlStatus()));
     }

+    /**
+     * Handles a request for the starting parameters of a crawl of a
+     * given timestamp and retrieves that information from the bundle
+     * held by the local queue server. It then
+     * outputs this info back as the body of the HTTP response
+     * (url-encoded, serialized PHP data)
+     */
+    function getCrawlSeedInfo()
+    {
+        $timestamp = 0;
+        if(isset($_REQUEST["arg"]) ) {
+            $timestamp = unserialize(webdecode($_REQUEST["arg"]));
+        }
+        echo webencode(serialize($this->crawlModel->getCrawlSeedInfo(
+            $timestamp)));
+    }
+
+
+    /**
+     * Handles a request to change the parameters of a crawl of a given
+     * timestamp on the local machine (does nothing if crawl doesn't exist)
+     */
+    function setCrawlSeedInfo()
+    {
+        if(isset($_REQUEST["arg"]) ) {
+            list($timestamp, $info) = unserialize(webdecode($_REQUEST["arg"]));
+            if($timestamp && $info) {
+                $this->crawlModel->setCrawlSeedInfo($timestamp, $info);
+            }
+        }
+    }
+
     /**
      * Handles a request for information about a crawl with a given timestamp
      * from a remote name server and retrieves statistics about this crawl
@@ -193,10 +226,14 @@ class CrawlController extends Controller implements CrawlConstants
             || !isset($_REQUEST["i"])) {
             return;
         }
-        $inject_urls = unserialize(webdecode($_REQUEST["arg"]));
+        $num = $this->clean($_REQUEST["num"], "int");
+        $i = $this->clean($_REQUEST["i"], "int");
+        list($timestamp, $inject_urls) =
+            unserialize(webdecode($_REQUEST["arg"]));
         $inject_urls = partitionByHash($inject_urls,
-            NULL, $num, $i, "ParseUrl::getHost");
-        $this->crawlModel->injectUrlsCurrentCrawl($inject_urls, NULL);
+            NULL, $num, $i, "UrlParser::getHost");
+        $this->crawlModel->injectUrlsCurrentCrawl($timestamp,
+            $inject_urls, NULL);
     }

     /**
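
Each queue_server that receives the injectUrlsCurrentCrawl activity above
keeps only its share of the urls. A small usage sketch of that
partitioning step in isolation (partitionByHash and UrlParser::getHost
are the real helpers used above; the sample urls and server counts are
made up):

// Suppose this queue_server is instance 1 of 3 ($num = 3, $i = 1).
$inject_urls = array("http://a.example.com/p1", "http://b.example.com/p2",
    "http://a.example.com/p3");
// Hash each url's host and keep only urls that land in partition 1.
// Urls with the same host always land on the same queue_server.
$my_urls = partitionByHash($inject_urls, NULL, 3, 1, "UrlParser::getHost");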
diff --git a/lib/utility.php b/lib/utility.php
index 5e1826fa7..84abd7d47 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -529,7 +529,22 @@ function crawlCrypt($string, $salt = NULL)
 }

 /**
- *
+ * Used by a controller to take a table and return those rows in the
+ * table that a given queue_server would be responsible for handling
+ *
+ * @param array $table an array of rows of associative arrays which
+ *      a queue_server might need to process
+ * @param string $field column of $table whose values should be used
+ *      for partitioning
+ * @param int $num_partition number of queue_servers to choose between
+ * @param int $instance the id of the particular server we are
+ *      interested in
+ * @param object $callback function or static method that might be
+ *      applied to input before deciding the responsible queue_server.
+ *      For example, if input was a url we might want to get the host
+ *      before deciding on the queue_server
+ * @return array the reduced table that the $instance queue_server is
+ *      responsible for
  */
 function partitionByHash($table, $field, $num_partition, $instance,
     $callback = NULL)
@@ -547,7 +562,18 @@ function partitionByHash($table, $field, $num_partition, $instance,
 }

 /**
- *
+ * Used by a controller to say which queue_server should receive
+ * a given input
+ * @param string $input can be viewed as a key that might be processed by a
+ *      queue_server. For example, in some cases input might be
+ *      a url and we want to determine which queue_server should be
+ *      responsible for queuing that url
+ * @param int $num_partition number of queue_servers to choose between
+ * @param object $callback function or static method that might be
+ *      applied to input before deciding the responsible queue_server.
+ *      For example, if input was a url we might want to get the host
+ *      before deciding on the queue_server
+ * @return int id of server responsible for input
  */
 function calculatePartition($input, $num_partition, $callback = NULL)
 {
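
A hedged sketch of how the two utility functions relate, assuming (the
function bodies are not shown in this diff) that calculatePartition()
reduces a hash of the, possibly callback-transformed, input modulo
$num_partition, which is the contract the new comments describe:

// Which of 3 queue_servers should queue this url?
$server_id = calculatePartition("http://a.example.com/p1", 3,
    "UrlParser::getHost");
// partitionByHash($table, $field, 3, $i, $callback) can then be read as:
// keep the rows of $table whose calculatePartition() value equals $i.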
diff --git a/models/crawl_model.php b/models/crawl_model.php
index a5eaf6286..a9e981c05 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -520,6 +520,25 @@ EOT;
      */
     function getCrawlSeedInfo($timestamp,  $machine_urls = NULL)
     {
+        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
+            /* seed info should be the same amongst all queue_servers that
+               have it -- only the start schedule differs -- however, not all
+               queue_servers necessarily have the same crawls. Thus, we still
+               query all machines in case only one has it.
+             */
+            $a_list = $this->execMachines("getCrawlSeedInfo",
+                $machine_urls, serialize($timestamp));
+            if(is_array($a_list)) {
+                foreach($a_list as $elt) {
+                    $seed_info = unserialize(webdecode(
+                        $elt[self::PAGE]));
+                    if(isset($seed_info['general'])) {
+                        break;
+                    }
+                }
+            }
+            return isset($seed_info) ? $seed_info : NULL;
+        }
         $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
         $seed_info = NULL;
         if(file_exists($dir)) {
@@ -569,6 +588,11 @@ EOT;
      */
     function setCrawlSeedInfo($timestamp, $new_info,  $machine_urls = NULL)
     {
+        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
+            $params = array($timestamp, $new_info);
+            $this->execMachines("setCrawlSeedInfo",
+                $machine_urls, serialize($params));
+        }
         $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
         if(file_exists($dir)) {
             $info = IndexArchiveBundle::getArchiveInfo($dir);
@@ -1048,13 +1072,17 @@ EOT;
                     $status[$field] += $a_status[$field];
                 }
             }
-            if(isset($a_status["CRAWL_TIME"]) && $a_status["CRAWL_TIME"] >
+            if(isset($a_status["CRAWL_TIME"]) && $a_status["CRAWL_TIME"] >=
                 $status['CRAWL_TIME']) {
                 $status['CRAWL_TIME'] = $a_status["CRAWL_TIME"];
                 $text_fields = array("DESCRIPTION", "MOST_RECENT_FETCHER");
                 foreach($text_fields as $field) {
                     if(isset($a_status[$field])) {
-                        $status[$field] = $a_status[$field];
+                        if($status[$field] == "" ||
+                            in_array($status[$field], array("BEGIN_CRAWL",
+                                "RESUME_CRAWL") )) {
+                            $status[$field] = $a_status[$field];
+                        }
                     }
                 }
             }
@@ -1115,15 +1143,17 @@ EOT;
      *  Add the provided urls to the schedule directory of URLs that will
      *  be crawled
      *
+     *  @param string $timestamp Unix timestamp of the crawl whose schedule
+     *      is being added to
      *  @param array $inject_urls urls to be added to the schedule of
      *      the active crawl
      *  @param array $machine_urls an array of urls of yioop queue servers
      */
-    function injectUrlsCurrentCrawl($inject_urls, $machine_urls = NULL)
+    function injectUrlsCurrentCrawl($timestamp, $inject_urls,
+        $machine_urls = NULL)
     {
         if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
             $this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
-                serialize($inject_urls));
+                serialize(array($timestamp, $inject_urls)));
             return;
         }
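
For reference, the distributed round trip added in this commit pairs the
model code above with the new CrawlController activities. A condensed
sketch of the getCrawlSeedInfo() leg (condensed from the hunks above,
not a verbatim excerpt):

// Name server side (CrawlModel): ask every queue_server, keep the first
// answer that actually has the crawl, i.e., whose 'general' section is set.
$seed_info = NULL;
$a_list = $this->execMachines("getCrawlSeedInfo", $machine_urls,
    serialize($timestamp));
if(is_array($a_list)) {
    foreach($a_list as $elt) {
        $candidate = unserialize(webdecode($elt[self::PAGE]));
        if(isset($candidate['general'])) {
            $seed_info = $candidate;
            break;
        }
    }
}
// Queue server side (CrawlController::getCrawlSeedInfo) answers each such
// request with: echo webencode(serialize(
//     $this->crawlModel->getCrawlSeedInfo($timestamp)));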