Adds documentation related to multi-queue server crawling, a=chris

Chris Pollett [2012-01-26]
Adds documentation related to multi-queue server crawling, a=chris
Filename
controllers/admin_controller.php
controllers/crawl_controller.php
lib/bigrams.php
lib/index_bundle_iterators/network_iterator.php
models/crawl_model.php
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 0a8464e49..b789db179 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -884,14 +884,14 @@ class AdminController extends Controller implements CrawlConstants
                         $timestamp =
                             $this->clean($_REQUEST['load_option'], "int");
                         $seed_info = $this->crawlModel->getCrawlSeedInfo(
-                            $timestamp);
+                            $timestamp, $machine_urls);
                         $update_flag = true;
                         $no_further_changes = true;
                     } else if(isset($_REQUEST['ts'])) {
                         $timestamp =
                             $this->clean($_REQUEST['ts'], "int");
                         $seed_info = $this->crawlModel->getCrawlSeedInfo(
-                            $timestamp);
+                            $timestamp, $machine_urls);
                         $data['ts'] = $timestamp;
                     } else {
                         $seed_info = $this->crawlModel->getSeedInfo();
diff --git a/controllers/crawl_controller.php b/controllers/crawl_controller.php
index ee2c187cd..dd35959ed 100644
--- a/controllers/crawl_controller.php
+++ b/controllers/crawl_controller.php
@@ -44,7 +44,12 @@ require_once BASE_DIR."/lib/url_parser.php";


 /**
- *
+ * Controller used to manage networked installations of Yioop where
+ * there might be multiple queue_servers and a name_server. Commands
+ * sent to the name_server web page are mapped out to queue_servers
+ * using this controller. Each method of the controller essentially
+ * mimics one method of CrawlModel and is used to proxy that information
+ * through a result web page back to the name_server.
  *
  * @author Chris Pollett
  * @package seek_quarry
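
A rough sketch of the name_server side of the round trip this class docblock describes (the ?c=crawl&a=... query layout and the webdecode() helper are assumptions inferred from the webencode(serialize(...)) calls later in this diff, not code shown here):

<?php
// Hypothetical name_server-side helper (not part of this commit): ask one
// queue_server's CrawlController to run an activity and decode its reply
function proxyActivity($queue_server_url, $activity)
{
    // $activity should be one of the names in CrawlController::$activities,
    // e.g. "crawlStatus" or "crawlStalled"
    $url = $queue_server_url . "?c=crawl&a=" . urlencode($activity);
    $response = file_get_contents($url);
    // invert the echo webencode(serialize(...)) done on the queue_server
    return unserialize(webdecode($response));
}
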
@@ -58,7 +63,7 @@ class CrawlController extends Controller implements CrawlConstants
      */
     var $models = array("crawl");
     /**
-     * Only outputs JSON data so don't need view
+     * Only outputs serialized php data so no view is needed
      * @var array
      */
     var $views = array();
@@ -70,11 +75,6 @@ class CrawlController extends Controller implements CrawlConstants
         "crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
         "getCrawlList", "combinedCrawlInfo", "getInfoTimestamp");

-    /**
-     * Number of characters from end of most recent log file to return
-     * on a log request
-     */
-    const LOG_LISTING_LEN = 100000;
     /**
      * Checks that the request seems to be coming from a legitimate fetcher then
      * determines which activity the fetcher is requesting and calls that
@@ -94,16 +94,38 @@ class CrawlController extends Controller implements CrawlConstants
         if(in_array($activity, $this->activities)) {$this->$activity();}
     }

+    /**
+     * Handles a request for whether or not the crawl is stalled on the
+     * given local server (which means no fetcher has spoken to it in a while)
+     * and outputs this info back as the body of the http response
+     * (url encoded, serialized php data)
+     */
+
     function crawlStalled()
     {
         echo webencode(serialize($this->crawlModel->crawlStalled()));
     }

+    /**
+     * Handles a request for the crawl status (memory use, recent
+     * fetchers, crawl rate, etc) data from a remote name server
+     * and retrieves the statistics about this that are held by
+     * the local queue server, then
+     * outputs this info back as the body of the http response (url encoded,
+     * serialized php data)
+     */
     function crawlStatus()
     {
         echo webencode(serialize($this->crawlModel->crawlStatus()));
     }

+    /**
+     * Handles a request for information about a crawl with a given timestamp
+     * from a remote name server and retrieves statistics about this crawl
+     * that are held by the local queue server (number of pages, name, etc),
+     * then outputs this info back as the body of the http response
+     * (url encoded, serialized php data)
+     */
     function getInfoTimestamp()
     {
         $timestamp = 0;
@@ -114,6 +136,13 @@ class CrawlController extends Controller implements CrawlConstants
             $timestamp)));
     }

+    /**
+     * Handles a request for the crawl list (which crawls are stored on
+     * the machine) from a remote name server and retrieves the
+     * statistics about this that are held by the local queue server, then
+     * outputs this info back as the body of the http response (url encoded,
+     * serialized php data)
+     */
     function getCrawlList()
     {
         $return_arc_bundles = false;
@@ -127,12 +156,23 @@ class CrawlController extends Controller implements CrawlConstants
             $return_arc_bundles, $return_recrawls)));
     }

+    /**
+     * Handles a request for the combined crawl list, stalled, and status
+     * data from a remote name server and retrieves the statistics about
+     * this that are held by the local queue server, then outputs this
+     * info back as the body of the http response (url encoded,
+     * serialized php data)
+     */
     function combinedCrawlInfo()
     {
         $combined =  $this->crawlModel->combinedCrawlInfo();
         echo webencode(serialize($combined));
     }

+    /**
+     * Receives a request to delete a crawl from a remote name server
+     * and then deletes that crawl on the local queue server
+     */
     function deleteCrawl()
     {
         if(!isset($_REQUEST["arg"]) ) {
@@ -142,6 +182,11 @@ class CrawlController extends Controller implements CrawlConstants
         $this->crawlModel->deleteCrawl($timestamp);
     }

+    /**
+     * Receives a request to inject new urls into the active
+     * crawl from a remote name server and then does this for
+     * the local queue server
+     */
     function injectUrlsCurrentCrawl()
     {
         if(!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
@@ -154,12 +199,20 @@ class CrawlController extends Controller implements CrawlConstants
         $this->crawlModel->injectUrlsCurrentCrawl($inject_urls, NULL);
     }

+    /**
+     * Receives a request to stop a crawl from a remote name server
+     * and then stops the current crawl on the local queue server
+     */
     function sendStopCrawlMessage()
     {
         $this->crawlModel->sendStopCrawlMessage();
     }


+    /**
+     * Receives a request to start a crawl from a remote name server
+     * and then starts the crawl process on the local queue server
+     */
     function sendStartCrawlMessage()
     {

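
Each proxied endpoint above follows the same shape: call the matching CrawlModel method, serialize the result, webencode it, and echo it as the response body. A hypothetical further activity (the method and model call names below are placeholders, not part of this commit) would look like:

    /**
     * Sketch of an additional proxied activity following the pattern above
     */
    function someNewActivity()
    {
        // someNewModelMethod() stands in for any CrawlModel method
        echo webencode(serialize($this->crawlModel->someNewModelMethod()));
    }

It would also have to be added to the $activities array that is checked with in_array() before dispatching.
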
diff --git a/lib/bigrams.php b/lib/bigrams.php
index 7a5716a18..4f0857ef0 100644
--- a/lib/bigrams.php
+++ b/lib/bigrams.php
@@ -61,7 +61,6 @@ require_once BASE_DIR."/lib/phrase_parser.php";
  */
 class Bigrams
 {
-
      /**
       * Language tags and their corresponding bigram prefix
       * @var array
@@ -140,8 +139,6 @@ class Bigrams
         return $bigram_phrases;
     }

-
-
     /**
      * Creates a bloom filter file from a bigram text file. The
      * path of bigram text file used is based on the input $lang.
diff --git a/lib/index_bundle_iterators/network_iterator.php b/lib/index_bundle_iterators/network_iterator.php
index a86d91af7..8fb267881 100644
--- a/lib/index_bundle_iterators/network_iterator.php
+++ b/lib/index_bundle_iterators/network_iterator.php
@@ -102,9 +102,12 @@ class NetworkIterator extends IndexBundleIterator
     /**
      * Creates a network iterator with the given parameters.
      *
-     * @param string $query
-     * @param array $queue_servers
-     * @param string $timestamp
+     * @param string $query the query that was supplied by the end user
+     *      that we are trying to get search results for
+     * @param array $queue_servers urls of yioop instances on which document
+     *      indexes live
+     * @param string $timestamp the timestamp of the particular current index
+     *      archive bundles that we look in for results
      */
     function __construct($query, $queue_servers, $timestamp)
     {
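
Given the documented constructor parameters, a usage sketch (the query, server urls, and timestamp below are made-up illustrations, not values from Yioop):

// Build an iterator that queries several queue_servers' indexes
$iterator = new NetworkIterator(
    "sjsu math",                                    // end-user search query
    array("http://10.0.0.2/", "http://10.0.0.3/"),  // queue_server urls
    "1327627482"                                    // index bundle timestamp
);
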
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 913e94f62..a5eaf6286 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -393,93 +393,6 @@ class CrawlModel extends Model implements CrawlConstants

     }

-    /**
-     * Returns the crawl parameters that were used during a given crawl
-     *
-     * @param string $timestamp timestamp of the crawl to load the crawl
-     *      parameters of
-     * @return array  the first sites to crawl during the next crawl
-     *      restrict_by_url, allowed, disallowed_sites
-     */
-    function getCrawlSeedInfo($timestamp)
-    {
-        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
-        $seed_info = NULL;
-        if(file_exists($dir)) {
-            $info = IndexArchiveBundle::getArchiveInfo($dir);
-            $index_info = unserialize($info['DESCRIPTION']);
-            $seed_info['general']["restrict_sites_by_url"] =
-                $index_info[self::RESTRICT_SITES_BY_URL];
-            $seed_info['general']["crawl_type"] =
-                (isset($index_info[self::CRAWL_TYPE])) ?
-                $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
-            $seed_info['general']["crawl_index"] =
-                (isset($index_info[self::CRAWL_INDEX])) ?
-                $index_info[self::CRAWL_INDEX] : '';
-            $seed_info['general']["crawl_order"] =
-                $index_info[self::CRAWL_ORDER];
-            $site_types = array(
-                "allowed_sites" => self::ALLOWED_SITES,
-                "disallowed_sites" => self::DISALLOWED_SITES,
-                "seed_sites" => self::TO_CRAWL
-            );
-            foreach($site_types as $type => $code) {
-                if(isset($index_info[$code])) {
-                    $tmp = & $index_info[$code];
-                } else {
-                    $tmp = array();
-                }
-                $seed_info[$type]['url'] =  $tmp;
-            }
-            $seed_info['meta_words'] = array();
-            if(isset($index_info[self::META_WORDS]) ) {
-                $seed_info['meta_words'] = $index_info[self::META_WORDS];
-            }
-            if(isset($index_info[self::INDEXING_PLUGINS])) {
-                $seed_info['indexing_plugins']['plugins'] =
-                    $index_info[self::INDEXING_PLUGINS];
-            }
-        }
-        return $seed_info;
-    }
-
-    /**
-     * Changes the crawl parameters of an existing crawl
-     *
-     * @param string $timestamp timestamp of the crawl to change
-     * @param array $new_info the new parameters
-     */
-    function setCrawlSeedInfo($timestamp, $new_info)
-    {
-        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
-        if(file_exists($dir)) {
-            $info = IndexArchiveBundle::getArchiveInfo($dir);
-            $index_info = unserialize($info['DESCRIPTION']);
-            if(isset($new_info['general']["restrict_sites_by_url"])) {
-                $index_info[self::RESTRICT_SITES_BY_URL] =
-                    $new_info['general']["restrict_sites_by_url"];
-            }
-            $updatable_site_info = array(
-                "allowed_sites" => self::ALLOWED_SITES,
-                "disallowed_sites" => self::DISALLOWED_SITES
-            );
-            foreach($updatable_site_info as $type => $code) {
-                if(isset($new_info[$type])) {
-                    $index_info[$code] = $new_info[$type]['url'];
-                }
-            }
-            if(isset($new_info['meta_words']) ) {
-                $index_info[self::META_WORDS] = $new_info['meta_words'];
-            }
-            if(isset($new_info['indexing_plugins']) ) {
-                $index_info[self::INDEXING_PLUGINS] =
-                    $new_info['indexing_plugins']['plugins'];
-            }
-            $info['DESCRIPTION'] = serialize($index_info);
-            IndexArchiveBundle::setArchiveInfo($dir, $info);
-        }
-    }
-

     /**
      *  Returns the initial sites that a new crawl will start with along with
@@ -594,6 +507,99 @@ EOT;
         file_put_contents(WORK_DIRECTORY."/crawl.ini", $out);
     }

+
+    /**
+     * Returns the crawl parameters that were used during a given crawl
+     *
+     * @param string $timestamp timestamp of the crawl to load the crawl
+     *      parameters of
+     * @param array $machine_urls an array of urls of yioop queue servers
+     * @return array the crawl parameters of the given crawl: seed sites,
+     *      crawl order, restrict_sites_by_url, allowed and disallowed
+     *      sites, meta words, and indexing plugins
+     */
+    function getCrawlSeedInfo($timestamp,  $machine_urls = NULL)
+    {
+        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
+        $seed_info = NULL;
+        if(file_exists($dir)) {
+            $info = IndexArchiveBundle::getArchiveInfo($dir);
+            $index_info = unserialize($info['DESCRIPTION']);
+            $seed_info['general']["restrict_sites_by_url"] =
+                $index_info[self::RESTRICT_SITES_BY_URL];
+            $seed_info['general']["crawl_type"] =
+                (isset($index_info[self::CRAWL_TYPE])) ?
+                $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
+            $seed_info['general']["crawl_index"] =
+                (isset($index_info[self::CRAWL_INDEX])) ?
+                $index_info[self::CRAWL_INDEX] : '';
+            $seed_info['general']["crawl_order"] =
+                $index_info[self::CRAWL_ORDER];
+            $site_types = array(
+                "allowed_sites" => self::ALLOWED_SITES,
+                "disallowed_sites" => self::DISALLOWED_SITES,
+                "seed_sites" => self::TO_CRAWL
+            );
+            foreach($site_types as $type => $code) {
+                if(isset($index_info[$code])) {
+                    $tmp = & $index_info[$code];
+                } else {
+                    $tmp = array();
+                }
+                $seed_info[$type]['url'] =  $tmp;
+            }
+            $seed_info['meta_words'] = array();
+            if(isset($index_info[self::META_WORDS]) ) {
+                $seed_info['meta_words'] = $index_info[self::META_WORDS];
+            }
+            if(isset($index_info[self::INDEXING_PLUGINS])) {
+                $seed_info['indexing_plugins']['plugins'] =
+                    $index_info[self::INDEXING_PLUGINS];
+            }
+        }
+        return $seed_info;
+    }
+
+    /**
+     * Changes the crawl parameters of an existing crawl
+     *
+     * @param string $timestamp timestamp of the crawl to change
+     * @param array $new_info the new parameters
+     * @param array $machine_urls an array of urls of yioop queue servers
+     */
+    function setCrawlSeedInfo($timestamp, $new_info,  $machine_urls = NULL)
+    {
+        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
+        if(file_exists($dir)) {
+            $info = IndexArchiveBundle::getArchiveInfo($dir);
+            $index_info = unserialize($info['DESCRIPTION']);
+            if(isset($new_info['general']["restrict_sites_by_url"])) {
+                $index_info[self::RESTRICT_SITES_BY_URL] =
+                    $new_info['general']["restrict_sites_by_url"];
+            }
+            $updatable_site_info = array(
+                "allowed_sites" => self::ALLOWED_SITES,
+                "disallowed_sites" => self::DISALLOWED_SITES
+            );
+            foreach($updatable_site_info as $type => $code) {
+                if(isset($new_info[$type])) {
+                    $index_info[$code] = $new_info[$type]['url'];
+                }
+            }
+            if(isset($new_info['meta_words']) ) {
+                $index_info[self::META_WORDS] = $new_info['meta_words'];
+            }
+            if(isset($new_info['indexing_plugins']) ) {
+                $index_info[self::INDEXING_PLUGINS] =
+                    $new_info['indexing_plugins']['plugins'];
+            }
+            $info['DESCRIPTION'] = serialize($index_info);
+            IndexArchiveBundle::setArchiveInfo($dir, $info);
+        }
+    }
+
+
+
     /**
      * Get a description associated with a Web Crawl or Crawl Mix
      *
@@ -745,6 +751,7 @@ EOT;

     /**
      * Used to send a message to the queue_servers to stop a crawl
+     * @param array $machine_urls an array of urls of yioop queue servers
      */
     function sendStopCrawlMessage($machine_urls = NULL)
     {
@@ -844,7 +851,13 @@ EOT;
     }

     /**
+     * When @see getCrawlList() is used in a multi-queue_server setting, this
+     * method is used to integrate the crawl lists received by the machines
      *
+     * @param array $list_strings serialized crawl list data from different
+     *  queue_servers
+     * @param string $data_field field of $list_strings to use for data
+     * @return array list of crawls and their meta data
      */
     function aggregateCrawlList($list_strings, $data_field = NULL)
     {
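
A hedged sketch of the merge this docblock describes; the method body is not shown in this diff, so the decode step is an assumption based on how CrawlController webencodes its serialized replies:

// combine the crawl lists sent back by each queue_server
$list = array();
foreach ($list_strings as $list_string) {
    $data = ($data_field !== NULL) ? $list_string[$data_field] : $list_string;
    $a_list = unserialize(webdecode($data));
    if (is_array($a_list)) {
        $list = array_merge($list, $a_list);  // crawls from all machines
    }
}
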
@@ -874,7 +887,15 @@ EOT;
         return $list;
     }
     /**
+     * Determines if the length of time since any of the fetchers has spoken
+     * with any of the queue_servers has exceeded CRAWL_TIME_OUT. If so,
+     * typically the caller of this method would do something such as officially
+     * stop the crawl.
      *
+     * @param array $machine_urls an array of urls of yioop queue servers
+     *      on which the crawl is being performed; if NULL the check is
+     *      done only against the local server
+     * @return bool whether the current crawl is stalled or not
      */
     function crawlStalled($machine_urls = NULL)
     {
@@ -884,7 +905,7 @@ EOT;
         }

         if(file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
-            //assume if status not updated forCRAWL_TIME_OUT
+            //assume if status not updated for CRAWL_TIME_OUT
             // crawl not active (do check for both scheduler and indexer)
             if(filemtime(
                 CRAWL_DIR."/schedules/crawl_status.txt") +
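
The truncated hunk above is the heart of the local stall test: if crawl_status.txt has not been modified within CRAWL_TIME_OUT seconds, no fetcher has checked in recently. Written out in full (a reconstruction for clarity, not new code):

$status_file = CRAWL_DIR . "/schedules/crawl_status.txt";
if (file_exists($status_file) &&
        filemtime($status_file) + CRAWL_TIME_OUT < time()) {
    return true; // status file is stale, so treat the crawl as stalled
}
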
@@ -903,7 +924,18 @@ EOT;
     }

     /**
+     * When @see crawlStalled() is used in a multi-queue_server setting, this
+     * method is used to integrate the stalled information received by the
+     * different machines
+     *
+     * @param array $stall_statuses contains web encoded serialized data,
+     *  one field of which has the boolean data concerning stalled status
      *
+     * @param string $data_field field of $stall_statuses to use for data;
+     *      if NULL then each element of $stall_statuses is a web encoded
+     *      serialized boolean
+     * @return bool true if at least one queue_server has heard from one
+     *      fetcher within the time out period
      */
     function aggregateStalled($stall_statuses, $data_field = NULL)
     {
@@ -920,7 +952,13 @@ EOT;
     }

     /**
+     *  Returns data about the current crawl such as DESCRIPTION, TIMESTAMP,
+     *  peak memory of various processes, most recent fetcher, most recent
+     *  urls, urls seen, urls visited, etc.
      *
+     *  @param array $machine_urls an array of urls of yioop queue servers
+     *      on which the crawl is being conducted
+     *  @return array associative array of the said data
      */
     function crawlStatus($machine_urls = NULL)
     {
@@ -972,7 +1010,15 @@ EOT;
     }

     /**
+     * When @see crawlStatus() is used in a multi-queue_server setting, this
+     * method is used to integrate the status information received by the
+     * different machines
      *
+     * @param array $status_strings status data from the different queue_servers
+     * @param string $data_field field of $status_strings to use for data
+     * @return array associative array of DESCRIPTION, TIMESTAMP,
+     *  peak memory of various processes, most recent fetcher, most recent
+     *  urls, urls seen, urls visited, etc.
      */
     function aggregateStatuses($status_strings, $data_field = NULL)
     {
@@ -1034,7 +1080,14 @@ EOT;
     }

     /**
+     *  This method is used to reduce the number of network requests
+     *  needed by the crawlStatus method of admin_controller. It returns
+     *  an array containing the results of the @see crawlStalled,
+     *  @see crawlStatus, and @see getCrawlList methods
      *
+     *  @param array $machine_urls an array of urls of yioop queue servers
+     *  @return array containing three components one for each of the three
+     *      kinds of results listed above
      */
     function combinedCrawlInfo($machine_urls = NULL)
     {
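
Since combinedCrawlInfo() bundles three results into one response, a caller can split them apart. A hedged sketch of consuming the result; the component order (stalled, status, crawl list) follows the docblock's listing but is not confirmed by this diff:

// one network round trip instead of three
list($stalled, $status, $crawl_list) =
    $crawl_model->combinedCrawlInfo($machine_urls);
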
@@ -1059,7 +1112,12 @@ EOT;
     }

     /**
+     *  Adds the provided urls to the schedule directory of URLs that will
+     *  be crawled
      *
+     *  @param array $inject_urls urls to be added to the schedule of
+     *      the active crawl
+     *  @param array $machine_urls an array of urls of yioop queue servers
      */
     function injectUrlsCurrentCrawl($inject_urls, $machine_urls = NULL)
     {
@@ -1107,7 +1165,23 @@ EOT;
     }

     /**
+     *  This method is invoked by other CrawlModel methods when they
+     *  want to have their method performed on an array of other
+     *  Yioop instances. The results returned can then be aggregated.
+     *  The invocation sequence is:
+     *  crawlModelMethodA invokes execMachines with a list of
+     *  urls of other Yioop instances. execMachines makes REST requests to
+     *  those instances with the given command and optional arguments.
+     *  Each request is handled by a CrawlController which in turn
+     *  calls crawlModelMethodA on that Yioop instance, serializes the
+     *  result, and gives it back to execMachines and then back to the
+     *  originally calling function.
      *
+     *  @param string $command the CrawlModel method to invoke on the remote
+     *      Yioop instances
+     *  @param array $machine_urls machines to invoke this command on
+     *  @param string $arg additional arguments to be passed to the remote machine
+     *  @return array a list of outputs from each machine that was called.
      */
     function execMachines($command, $machine_urls, $arg = NULL)
     {
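
Putting the pieces together, the fan-out the execMachines() docblock describes might be used from a model method like this. A sketch assembled from the signatures in this diff, not the method's actual body:

function crawlStalled($machine_urls = NULL)
{
    if ($machine_urls != NULL) {
        // each queue_server's CrawlController runs its local crawlStalled()
        // and replies with webencoded serialized data
        $outputs = $this->execMachines("crawlStalled", $machine_urls);
        // merge the per-machine answers into one boolean
        return $this->aggregateStalled($outputs);
    }
    // ...otherwise fall back to the local crawl_status.txt check above...
}
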