diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 0a8464e49..b789db179 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -884,14 +884,14 @@ class AdminController extends Controller implements CrawlConstants
$timestamp =
$this->clean($_REQUEST['load_option'], "int");
$seed_info = $this->crawlModel->getCrawlSeedInfo(
- $timestamp);
+ $timestamp, $machine_urls);
$update_flag = true;
$no_further_changes = true;
} else if(isset($_REQUEST['ts'])) {
$timestamp =
$this->clean($_REQUEST['ts'], "int");
$seed_info = $this->crawlModel->getCrawlSeedInfo(
- $timestamp);
+ $timestamp, $machine_urls);
$data['ts'] = $timestamp;
} else {
$seed_info = $this->crawlModel->getSeedInfo();
diff --git a/controllers/crawl_controller.php b/controllers/crawl_controller.php
index ee2c187cd..dd35959ed 100644
--- a/controllers/crawl_controller.php
+++ b/controllers/crawl_controller.php
@@ -44,7 +44,12 @@ require_once BASE_DIR."/lib/url_parser.php";
/**
- *
+ * Controller used to manage networked installations of Yioop where
+ * there might be multiple queue_servers and a name_server. Commands
+ * sent to the name server web page are mapped out to queue_servers
+ * using this controller. Each method of the controller essentially
+ * mimics one method of CrawlModel and is used to proxy that information
+ * through a result web page back to the name_server.
*
* @author Chris Pollett
* @package seek_quarry
@@ -58,7 +63,7 @@ class CrawlController extends Controller implements CrawlConstants
*/
var $models = array("crawl");
/**
- * Only outputs JSON data so don't need view
+ * Only outputs serialized php data so don't need view
* @var array
*/
var $views = array();
@@ -70,11 +75,6 @@ class CrawlController extends Controller implements CrawlConstants
"crawlStalled", "crawlStatus", "deleteCrawl", "injectUrlsCurrentCrawl",
"getCrawlList", "combinedCrawlInfo", "getInfoTimestamp");
- /**
- * Number of characters from end of most recent log file to return
- * on a log request
- */
- const LOG_LISTING_LEN = 100000;
/**
* Checks that the request seems to be coming from a legitimate fetcher then
* determines which activity the fetcher is requesting and calls that
@@ -94,16 +94,38 @@ class CrawlController extends Controller implements CrawlConstants
if(in_array($activity, $this->activities)) {$this->$activity();}
}
+ /**
+ * Handles a request for whether or not the crawl is stalled on the
+ * given local server (which means no fetcher has spoken to it in a while)
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
+
function crawlStalled()
{
echo webencode(serialize($this->crawlModel->crawlStalled()));
}
+ /**
+     * Handles a request for the crawl status (memory use, recent fetchers,
+     * crawl rate, etc) data from a remote name server
+     * and retrieves the statistics about this that are held by the
+ * local queue server
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
function crawlStatus()
{
echo webencode(serialize($this->crawlModel->crawlStatus()));
}
+ /**
+ * Handles a request for information about a crawl with a given timestamp
+ * from a remote name server and retrieves statistics about this crawl
+ * that are held by the local queue server (number of pages, name, etc)
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
function getInfoTimestamp()
{
$timestamp = 0;
@@ -114,6 +136,13 @@ class CrawlController extends Controller implements CrawlConstants
$timestamp)));
}
+ /**
+     * Handles a request for the crawl list (what crawls are stored on the
+     * machine) data from a remote name server and retrieves the
+     * statistics about this that are held by the local queue server
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
function getCrawlList()
{
$return_arc_bundles = false;
@@ -127,12 +156,23 @@ class CrawlController extends Controller implements CrawlConstants
$return_arc_bundles, $return_recrawls)));
}
+ /**
+ * Handles a request for the combined crawl list, stalled, and status
+     * data from a remote name server and retrieves the statistics about
+     * this that are held by the local queue server
+ * outputs this info back as body of the http response (url encoded,
+ * serialized php data)
+ */
function combinedCrawlInfo()
{
$combined = $this->crawlModel->combinedCrawlInfo();
echo webencode(serialize($combined));
}
+ /**
+ * Receives a request to delete a crawl from a remote name server
+     * and then deletes the crawl on the local queue server
+ */
function deleteCrawl()
{
if(!isset($_REQUEST["arg"]) ) {
@@ -142,6 +182,11 @@ class CrawlController extends Controller implements CrawlConstants
$this->crawlModel->deleteCrawl($timestamp);
}
+ /**
+ * Receives a request to inject new urls into the active
+ * crawl from a remote name server and then does this for
+ * the local queue server
+ */
function injectUrlsCurrentCrawl()
{
if(!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"])
@@ -154,12 +199,20 @@ class CrawlController extends Controller implements CrawlConstants
$this->crawlModel->injectUrlsCurrentCrawl($inject_urls, NULL);
}
+ /**
+ * Receives a request to stop a crawl from a remote name server
+     * and then stops the current crawl on the local queue server
+ */
function sendStopCrawlMessage()
{
$this->crawlModel->sendStopCrawlMessage();
}
+ /**
+ * Receives a request to start a crawl from a remote name server
+ * and then starts the crawl process on the local queue server
+ */
function sendStartCrawlMessage()
{
diff --git a/lib/bigrams.php b/lib/bigrams.php
index 7a5716a18..4f0857ef0 100644
--- a/lib/bigrams.php
+++ b/lib/bigrams.php
@@ -61,7 +61,6 @@ require_once BASE_DIR."/lib/phrase_parser.php";
*/
class Bigrams
{
-
/**
* Language tags and their corresponding bigram prefix
* @var array
@@ -140,8 +139,6 @@ class Bigrams
return $bigram_phrases;
}
-
-
/**
* Creates a bloom filter file from a bigram text file. The
* path of bigram text file used is based on the input $lang.
diff --git a/lib/index_bundle_iterators/network_iterator.php b/lib/index_bundle_iterators/network_iterator.php
index a86d91af7..8fb267881 100644
--- a/lib/index_bundle_iterators/network_iterator.php
+++ b/lib/index_bundle_iterators/network_iterator.php
@@ -102,9 +102,12 @@ class NetworkIterator extends IndexBundleIterator
/**
* Creates a network iterator with the given parameters.
*
- * @param string $query
- * @param array $queue_servers
- * @param string $timestamp
+ * @param string $query the query that was supplied by the end user
+ * that we are trying to get search results for
+ * @param array $queue_servers urls of yioop instances on which documents
+ * indexes live
+ * @param string $timestamp the timestamp of the particular current index
+ * archive bundles that we look in for results
*/
function __construct($query, $queue_servers, $timestamp)
{
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 913e94f62..a5eaf6286 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -393,93 +393,6 @@ class CrawlModel extends Model implements CrawlConstants
}
- /**
- * Returns the crawl parameters that were used during a given crawl
- *
- * @param string $timestamp timestamp of the crawl to load the crawl
- * parameters of
- * @return array the first sites to crawl during the next crawl
- * restrict_by_url, allowed, disallowed_sites
- */
- function getCrawlSeedInfo($timestamp)
- {
- $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
- $seed_info = NULL;
- if(file_exists($dir)) {
- $info = IndexArchiveBundle::getArchiveInfo($dir);
- $index_info = unserialize($info['DESCRIPTION']);
- $seed_info['general']["restrict_sites_by_url"] =
- $index_info[self::RESTRICT_SITES_BY_URL];
- $seed_info['general']["crawl_type"] =
- (isset($index_info[self::CRAWL_TYPE])) ?
- $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
- $seed_info['general']["crawl_index"] =
- (isset($index_info[self::CRAWL_INDEX])) ?
- $index_info[self::CRAWL_INDEX] : '';
- $seed_info['general']["crawl_order"] =
- $index_info[self::CRAWL_ORDER];
- $site_types = array(
- "allowed_sites" => self::ALLOWED_SITES,
- "disallowed_sites" => self::DISALLOWED_SITES,
- "seed_sites" => self::TO_CRAWL
- );
- foreach($site_types as $type => $code) {
- if(isset($index_info[$code])) {
- $tmp = & $index_info[$code];
- } else {
- $tmp = array();
- }
- $seed_info[$type]['url'] = $tmp;
- }
- $seed_info['meta_words'] = array();
- if(isset($index_info[self::META_WORDS]) ) {
- $seed_info['meta_words'] = $index_info[self::META_WORDS];
- }
- if(isset($index_info[self::INDEXING_PLUGINS])) {
- $seed_info['indexing_plugins']['plugins'] =
- $index_info[self::INDEXING_PLUGINS];
- }
- }
- return $seed_info;
- }
-
- /**
- * Changes the crawl parameters of an existing crawl
- *
- * @param string $timestamp timestamp of the crawl to change
- * @param array $new_info the new parameters
- */
- function setCrawlSeedInfo($timestamp, $new_info)
- {
- $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
- if(file_exists($dir)) {
- $info = IndexArchiveBundle::getArchiveInfo($dir);
- $index_info = unserialize($info['DESCRIPTION']);
- if(isset($new_info['general']["restrict_sites_by_url"])) {
- $index_info[self::RESTRICT_SITES_BY_URL] =
- $new_info['general']["restrict_sites_by_url"];
- }
- $updatable_site_info = array(
- "allowed_sites" => self::ALLOWED_SITES,
- "disallowed_sites" => self::DISALLOWED_SITES
- );
- foreach($updatable_site_info as $type => $code) {
- if(isset($new_info[$type])) {
- $index_info[$code] = $new_info[$type]['url'];
- }
- }
- if(isset($new_info['meta_words']) ) {
- $index_info[self::META_WORDS] = $new_info['meta_words'];
- }
- if(isset($new_info['indexing_plugins']) ) {
- $index_info[self::INDEXING_PLUGINS] =
- $new_info['indexing_plugins']['plugins'];
- }
- $info['DESCRIPTION'] = serialize($index_info);
- IndexArchiveBundle::setArchiveInfo($dir, $info);
- }
- }
-
/**
* Returns the initial sites that a new crawl will start with along with
@@ -594,6 +507,99 @@ EOT;
file_put_contents(WORK_DIRECTORY."/crawl.ini", $out);
}
+
+ /**
+ * Returns the crawl parameters that were used during a given crawl
+ *
+     * @param string $timestamp timestamp of the crawl to load the crawl
+     *      parameters of
+     * @param array $machine_urls an array of urls of yioop queue servers
+     * @return array the first sites to crawl during the next crawl
+     *      restrict_by_url, allowed, disallowed_sites
+     *
+     */
+ function getCrawlSeedInfo($timestamp, $machine_urls = NULL)
+ {
+ $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
+ $seed_info = NULL;
+ if(file_exists($dir)) {
+ $info = IndexArchiveBundle::getArchiveInfo($dir);
+ $index_info = unserialize($info['DESCRIPTION']);
+ $seed_info['general']["restrict_sites_by_url"] =
+ $index_info[self::RESTRICT_SITES_BY_URL];
+ $seed_info['general']["crawl_type"] =
+ (isset($index_info[self::CRAWL_TYPE])) ?
+ $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
+ $seed_info['general']["crawl_index"] =
+ (isset($index_info[self::CRAWL_INDEX])) ?
+ $index_info[self::CRAWL_INDEX] : '';
+ $seed_info['general']["crawl_order"] =
+ $index_info[self::CRAWL_ORDER];
+ $site_types = array(
+ "allowed_sites" => self::ALLOWED_SITES,
+ "disallowed_sites" => self::DISALLOWED_SITES,
+ "seed_sites" => self::TO_CRAWL
+ );
+ foreach($site_types as $type => $code) {
+ if(isset($index_info[$code])) {
+ $tmp = & $index_info[$code];
+ } else {
+ $tmp = array();
+ }
+ $seed_info[$type]['url'] = $tmp;
+ }
+ $seed_info['meta_words'] = array();
+ if(isset($index_info[self::META_WORDS]) ) {
+ $seed_info['meta_words'] = $index_info[self::META_WORDS];
+ }
+ if(isset($index_info[self::INDEXING_PLUGINS])) {
+ $seed_info['indexing_plugins']['plugins'] =
+ $index_info[self::INDEXING_PLUGINS];
+ }
+ }
+ return $seed_info;
+ }
+
+ /**
+ * Changes the crawl parameters of an existing crawl
+ *
+ * @param string $timestamp timestamp of the crawl to change
+ * @param array $new_info the new parameters
+ * @param array $machine_urls an array of urls of yioop queue servers
+ */
+ function setCrawlSeedInfo($timestamp, $new_info, $machine_urls = NULL)
+ {
+ $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
+ if(file_exists($dir)) {
+ $info = IndexArchiveBundle::getArchiveInfo($dir);
+ $index_info = unserialize($info['DESCRIPTION']);
+ if(isset($new_info['general']["restrict_sites_by_url"])) {
+ $index_info[self::RESTRICT_SITES_BY_URL] =
+ $new_info['general']["restrict_sites_by_url"];
+ }
+ $updatable_site_info = array(
+ "allowed_sites" => self::ALLOWED_SITES,
+ "disallowed_sites" => self::DISALLOWED_SITES
+ );
+ foreach($updatable_site_info as $type => $code) {
+ if(isset($new_info[$type])) {
+ $index_info[$code] = $new_info[$type]['url'];
+ }
+ }
+ if(isset($new_info['meta_words']) ) {
+ $index_info[self::META_WORDS] = $new_info['meta_words'];
+ }
+ if(isset($new_info['indexing_plugins']) ) {
+ $index_info[self::INDEXING_PLUGINS] =
+ $new_info['indexing_plugins']['plugins'];
+ }
+ $info['DESCRIPTION'] = serialize($index_info);
+ IndexArchiveBundle::setArchiveInfo($dir, $info);
+ }
+ }
+
+
+
/**
* Get a description associated with a Web Crawl or Crawl Mix
*
@@ -745,6 +751,7 @@ EOT;
/**
* Used to send a message to the queue_servers to stop a crawl
+ * @param array $machine_urls an array of urls of yioop queue servers
*/
function sendStopCrawlMessage($machine_urls = NULL)
{
@@ -844,7 +851,13 @@ EOT;
}
/**
+ * When @see getCrawlList() is used in a multi-queue_server this method
+ * used to integrate the crawl lists received by the different machines
*
+ * @param array $list_strings serialized crawl list data from different
+ * queue_servers
+ * @param string $data_field field of $list_strings to use for data
+ * @return array list of crawls and their meta data
*/
function aggregateCrawlList($list_strings, $data_field = NULL)
{
@@ -874,7 +887,15 @@ EOT;
return $list;
}
/**
+ * Determines if the length of time since any of the fetchers has spoken
+ * with any of the queue_servers has exceeded CRAWL_TIME_OUT. If so,
+ * typically the caller of this method would do something such as officially
+ * stop the crawl.
*
+ * @param array $list_strings serialized crawl list data from different
+ * queue_servers
+ * @param array $machine_urls an array of urls of yioop queue servers
+ * @return bool whether the current crawl is stalled or not
*/
function crawlStalled($machine_urls = NULL)
{
@@ -884,7 +905,7 @@ EOT;
}
if(file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
- //assume if status not updated forCRAWL_TIME_OUT
+ //assume if status not updated for CRAWL_TIME_OUT
// crawl not active (do check for both scheduler and indexer)
if(filemtime(
CRAWL_DIR."/schedules/crawl_status.txt") +
@@ -903,7 +924,18 @@ EOT;
}
/**
+ * When @see crawlStalled() is used in a multi-queue_server this method
+ * used to integrate the stalled information received by the different
+ * machines
+ *
+     * @param array $stall_statuses contains web encoded serialized data,
+     * one field of which has the boolean data concerning stalled status
*
+ * @param string $data_field field of $stall_statuses to use for data
+     *      if NULL then each element of $stall_statuses is a web encoded
+ * serialized boolean
+     * @return bool true if at least one queue_server has heard from one
+ * fetcher within the time out period
*/
function aggregateStalled($stall_statuses, $data_field = NULL)
{
@@ -920,7 +952,13 @@ EOT;
}
/**
+ * Returns data about current crawl such as DESCRIPTION, TIMESTAMP,
+ * peak memory of various processes, most recent fetcher, most recent
+ * urls, urls seen, urls visited, etc.
*
+ * @param array $machine_urls an array of urls of yioop queue servers
+ * on which the crawl is being conducted
+ * @return array associative array of the said data
*/
function crawlStatus($machine_urls = NULL)
{
@@ -972,7 +1010,15 @@ EOT;
}
/**
+ * When @see crawlStatus() is used in a multi-queue_server this method
+ * used to integrate the status information received by the different
+ * machines
*
+     * @param array $status_strings serialized status data from queue_servers
+ * @param string $data_field field of $status_strings to use for data
+ * @return array associative array of DESCRIPTION, TIMESTAMP,
+ * peak memory of various processes, most recent fetcher, most recent
+ * urls, urls seen, urls visited, etc.
*/
function aggregateStatuses($status_strings, $data_field = NULL)
{
@@ -1034,7 +1080,14 @@ EOT;
}
/**
+ * This method is used to reduce the number of network requests
+ * needed by the crawlStatus method of admin_controller. It returns
+ * an array containing the results of the @see crawlStalled
+ * @see crawlStatus and @see getCrawlList methods
*
+ * @param array $machine_urls an array of urls of yioop queue servers
+ * @return array containing three components one for each of the three
+ * kinds of results listed above
*/
function combinedCrawlInfo($machine_urls = NULL)
{
@@ -1059,7 +1112,12 @@ EOT;
}
/**
+ * Add the provided urls to the schedule directory of URLs that will
+ * be crawled
*
+ * @param array $inject_urls urls to be added to the schedule of
+ * the active crawl
+ * @param array $machine_urls an array of urls of yioop queue servers
*/
function injectUrlsCurrentCrawl($inject_urls, $machine_urls = NULL)
{
@@ -1107,7 +1165,23 @@ EOT;
}
/**
+ * This method is invoked by other crawlModel methods when they
+ * want to have their method performed on an array of other
+ * Yioop instances. The results returned can then be aggregated.
+     * The invocation sequence is
+ * crawlModelMethodA invokes execMachine with a list of
+ * urls of other Yioop instances. execMachine makes REST requests of
+ * those instances of the given command and optional arguments
+ * This request would be handled by a CrawlController which in turn
+ * calls crawlModelMethodA on the given Yioop instance, serializes the
+ * result and gives it back to execMachine and then back to the originally
+ * calling function.
*
+ * @param string $command the CrawlModel method to invoke on the remote
+ * Yioop instances
+ * @param array $machine_urls machines to invoke this command on
+     * @param string $arg additional arguments to be passed to the remote machine
+ * @return array a list of outputs from each machine that was called.
*/
function execMachines($command, $machine_urls, $arg = NULL)
{