diff --git a/bin/queue_server.php b/bin/queue_server.php
index e81d2df6f..8dc27cfb0 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1148,6 +1148,12 @@ class QueueServer implements CrawlConstants, Join
* @param string $base_dir directory for of schedules
* @param string $callback_method what method should be called to handle
* a schedule
+ * @param bool $blocking this method might be called by the indexer
+ * subcomponent when a merge tier phase is ongoing to allow for
+ * other processing to occur. If so, we don't want a regress
+ * where the indexer calls this code, which calls the indexer, etc. If
+ * the blocking flag is set then the indexer subcomponent won't
+ * be called
*/
function processDataFile($base_dir, $callback_method, $blocking = false)
{
@@ -1204,6 +1210,12 @@ class QueueServer implements CrawlConstants, Join
* Sets up the directory to look for a file of unprocessed
* index archive data from fetchers then calls the function
* processDataFile to process the oldest file found
+ * @param bool $blocking this method might be called by the indexer
+ * subcomponent when a merge tier phase is ongoing to allow for
+ * other processing to occur. If so, we don't want a regress
+ * where the indexer calls this code calls the indexer etc. If
+ * the blocking flag is set then the indexer subcomponent won't
+ * be called
*/
function processIndexData($blocking)
{
@@ -1219,6 +1231,12 @@ class QueueServer implements CrawlConstants, Join
*
* @param string $file containing web pages summaries and a mini-inverted
* index for their content
+ * @param bool $blocking this method might be called by the indexer
+ * subcomponent when a merge tier phase is ongoing to allow for
+ * other processing to occur. If so, we don't want a regress
+ * where the indexer calls this code calls the indexer etc. If
+ * the blocking flag is set then the indexer subcomponent won't
+ * be called
*/
function processIndexArchive($file, $blocking)
{
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index b789db179..78895f8cd 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -837,7 +837,8 @@ class AdminController extends Controller implements CrawlConstants
$data["leftorright"] =
(getLocaleDirection() == 'ltr') ? "right": "left";
$data["ELEMENT"] = "crawloptionsElement";
- $crawls = $this->crawlModel->getCrawlList();
+ $crawls = $this->crawlModel->getCrawlList(false, false,
+ $machine_urls);
$indexes = $this->crawlModel->getCrawlList(true, true,
$machine_urls);
$update_flag = false;
@@ -1024,11 +1025,13 @@ class AdminController extends Controller implements CrawlConstants
$add_message = "";
if(isset($_REQUEST['ts']) &&
isset($_REQUEST['inject_sites'])) {
+ $timestamp = $this->clean($_REQUEST['ts'],
+ "string");
$inject_urls =
$this->convertStringCleanUrlsArray(
$_REQUEST['inject_sites']);
if($this->crawlModel->injectUrlsCurrentCrawl(
- $inject_urls, $machine_urls)) {
+ $timestamp, $inject_urls, $machine_urls)) {
$add_message = "<br />".
tl('admin_controller_urls_injected');
}
@@ -1036,7 +1039,7 @@ class AdminController extends Controller implements CrawlConstants
if($update_flag) {
if(isset($_REQUEST['ts'])) {
$this->crawlModel->setCrawlSeedInfo($timestamp,
- $seed_info);
+ $seed_info, $machine_urls);
} else {
$this->crawlModel->setSeedInfo($seed_info);
}
diff --git a/controllers/crawl_controller.php b/controllers/crawl_controller.php
index dd35959ed..91f8a9dec 100644
--- a/controllers/crawl_controller.php
+++ b/controllers/crawl_controller.php
@@ -73,7 +73,8 @@ class CrawlController extends Controller implements CrawlConstants
*/
var $activities = array("sendStartCrawlMessage", "sendStopCrawlMessage",
"crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
- "getCrawlList", "combinedCrawlInfo", "getInfoTimestamp");
+ "getCrawlList", "combinedCrawlInfo", "getInfoTimestamp",
+ "getCrawlSeedInfo", "setCrawlSeedInfo");
/**
* Checks that the request seems to be coming from a legitimate fetcher then
@@ -119,6 +120,38 @@ class CrawlController extends Controller implements CrawlConstants
echo webencode(serialize($this->crawlModel->crawlStatus()));
}
+ /**
+ * Handles a request for the starting parameters of a crawl of a given
+ * timestamp and retrieves that information from the bundle held by the
+ * local queue server
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
+ function getCrawlSeedInfo()
+ {
+ $timestamp = 0;
+ if(isset($_REQUEST["arg"]) ) {
+ $timestamp = unserialize(webdecode($_REQUEST["arg"]));
+ }
+ echo webencode(serialize($this->crawlModel->getCrawlSeedInfo(
+ $timestamp)));
+ }
+
+
+ /**
+ * Handles a request to change the parameters of a crawl of a given
+ * timestamp on the local machine (does nothing if crawl doesn't exist)
+ */
+ function setCrawlSeedInfo()
+ {
+ if(isset($_REQUEST["arg"]) ) {
+ list($timestamp, $info) = unserialize(webdecode($_REQUEST["arg"]));
+ if($timestamp && $info) {
+ $this->crawlModel->setCrawlSeedInfo($timestamp, $info);
+ }
+ }
+ }
+
/**
* Handles a request for information about a crawl with a given timestamp
* from a remote name server and retrieves statistics about this crawl
@@ -193,10 +226,14 @@ class CrawlController extends Controller implements CrawlConstants
|| !isset($_REQUEST["i"])) {
return;
}
- $inject_urls = unserialize(webdecode($_REQUEST["arg"]));
+ $num = $this->clean($_REQUEST["num"], "int");
+ $i = $this->clean($_REQUEST["i"], "int");
+ list($timestamp, $inject_urls) =
+ unserialize(webdecode($_REQUEST["arg"]));
$inject_urls = partitionByHash($inject_urls,
- NULL, $num, $i, "ParseUrl::getHost");
- $this->crawlModel->injectUrlsCurrentCrawl($inject_urls, NULL);
+ NULL, $num, $i, "UrlParser::getHost");
+ $this->crawlModel->injectUrlsCurrentCrawl($timestamp,
+ $inject_urls, NULL);
}
/**
diff --git a/lib/utility.php b/lib/utility.php
index 5e1826fa7..84abd7d47 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -529,7 +529,22 @@ function crawlCrypt($string, $salt = NULL)
}
/**
- *
+ * Used by a controller to take a table and return those rows in the
+ * table that a given queue_server would be responsible for handling
+ *
+ * @param array $table an array of rows of associative arrays which
+ * a queue_server might need to process
+ * @param string $field column of $table whose values should be used
+ * for partitioning
+ * @param int $num_partition number of queue_servers to choose between
+ * @param int $instance the id of the particular server we are interested
+ * in
+ * @param object $callback function or static method that might be
+ * applied to input before deciding the responsible queue_server.
+ * For example, if input was a url we might want to get the host
+ * before deciding on the queue_server
+ * @return array the reduced table that the $instance queue_server is
+ * responsible for
*/
function partitionByHash($table, $field, $num_partition, $instance,
$callback = NULL)
@@ -547,7 +562,18 @@ function partitionByHash($table, $field, $num_partition, $instance,
}
/**
- *
+ * Used by a controller to say which queue_server should receive
+ * a given input
+ * @param string $input can be viewed as a key that might be processed by a
+ * queue_server. For example, in some cases input might be
+ * a url and we want to determine which queue_server should be
+ * responsible for queuing that url
+ * @param int $num_partition number of queue_servers to choose between
+ * @param object $callback function or static method that might be
+ * applied to input before deciding the responsible queue_server.
+ * For example, if input was a url we might want to get the host
+ * before deciding on the queue_server
+ * @return int id of server responsible for input
*/
function calculatePartition($input, $num_partition, $callback = NULL)
{
diff --git a/models/crawl_model.php b/models/crawl_model.php
index a5eaf6286..a9e981c05 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -520,6 +520,25 @@ EOT;
*/
function getCrawlSeedInfo($timestamp, $machine_urls = NULL)
{
+ if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
+ /* seed info should be same amongst all queue_servers that have it--
+ only start schedule differs -- however, not all queue_servers
+ necessarily have the same crawls. Thus, we still query all
+ machines in case only one has it.
+ */
+ $a_list = $this->execMachines("getCrawlSeedInfo",
+ $machine_urls, serialize($timestamp));
+ if(is_array($a_list)) {
+ foreach($a_list as $elt) {
+ $seed_info = unserialize(webdecode(
+ $elt[self::PAGE]));
+ if(isset($seed_info['general'])) {
+ break;
+ }
+ }
+ }
+ return $seed_info;
+ }
$dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
$seed_info = NULL;
if(file_exists($dir)) {
@@ -569,6 +588,11 @@ EOT;
*/
function setCrawlSeedInfo($timestamp, $new_info, $machine_urls = NULL)
{
+ if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
+ $params = array($timestamp, $new_info);
+ $this->execMachines("setCrawlSeedInfo",
+ $machine_urls, serialize($params));
+ }
$dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
if(file_exists($dir)) {
$info = IndexArchiveBundle::getArchiveInfo($dir);
@@ -1048,13 +1072,17 @@ EOT;
$status[$field] += $a_status[$field];
}
}
- if(isset($a_status["CRAWL_TIME"]) && $a_status["CRAWL_TIME"] >
+ if(isset($a_status["CRAWL_TIME"]) && $a_status["CRAWL_TIME"] >=
$status['CRAWL_TIME']) {
$status['CRAWL_TIME'] = $a_status["CRAWL_TIME"];
$text_fields = array("DESCRIPTION", "MOST_RECENT_FETCHER");
foreach($text_fields as $field) {
if(isset($a_status[$field])) {
- $status[$field] = $a_status[$field];
+ if($status[$field] == "" ||
+ in_array($status[$field], array("BEGIN_CRAWL",
+ "RESUME_CRAWL") )) {
+ $status[$field] = $a_status[$field];
+ }
}
}
}
@@ -1115,15 +1143,17 @@ EOT;
* Add the provided urls to the schedule directory of URLs that will
* be crawled
*
+ * @param string $timestamp Unix timestamp of crawl to add to schedule of
* @param array $inject_urls urls to be added to the schedule of
* the active crawl
* @param array $machine_urls an array of urls of yioop queue servers
*/
- function injectUrlsCurrentCrawl($inject_urls, $machine_urls = NULL)
+ function injectUrlsCurrentCrawl($timestamp, $inject_urls,
+ $machine_urls = NULL)
{
if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
$this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
- serialize($inject_urls));
+ serialize(array($timestamp, $inject_urls)));
return;
}