Fix a crash problem in QueueServer caused by array access on false in robots handling, a=chris
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 1ce78c81b..6e0019de2 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1925,7 +1925,7 @@ class QueueServer implements CrawlConstants
$sites = unserialize(gzuncompress(
L\webdecode(file_get_contents($file))));
L\crawlLog("Scheduler done decompressing robot file");
- if (isset($sites)) {
+ if (!empty($sites)) {
$num_sites = count($sites);
$i = 0;
foreach ($sites as $robot_host => $robot_info) {
diff --git a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php
index 231c7678e..8186c9a51 100644
--- a/src/library/CrawlDaemon.php
+++ b/src/library/CrawlDaemon.php
@@ -67,7 +67,7 @@ class CrawlDaemon implements CrawlConstants
*/
public static $mode;
/**
- * Tick callback function used to update the timestamp in this processes
+ * Callback function used to update the timestamp in this process's
* lock. If lock_file does not exist or more than PROCESS_TIMEOUT
* time has elapsed since the last processHandler call it stops the process
*
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index bfd0bd668..307733e17 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -100,7 +100,10 @@ class IndexDocumentBundle implements CrawlConstants
*/
const DOCUMENTS_FOLDER = "documents";
/**
- *
+ * Name of the last entries file used to help compute difference lists
+ * for doc_index, and position list offsets used in postings for the
+ * partition. This file is also used to track the total number of
+ * occurrences of a term in a partition
*/
const LAST_ENTRIES_FILENAME = "last_entries";
/**
@@ -114,7 +117,8 @@ class IndexDocumentBundle implements CrawlConstants
self::LAST_ENTRIES_FILENAME, self::POSITIONS_FILENAME,
self::POSTINGS_FILENAME];
/**
- *
+ * Name of the file within a partition's positions_doc_maps folder used
+ * to contain the partition's position list for each term.
*/
const POSITIONS_FILENAME = "positions";
/**
@@ -122,7 +126,12 @@ class IndexDocumentBundle implements CrawlConstants
*/
const POSTINGS_FILENAME = "postings";
/**
- *
+ * Name of the folder used to hold position lists and document maps. Within
+ * this folder there is a subfolder for each partition which contains a
+ * doc_map file, postings file for the docs within the partition,
+ * position lists file for those postings, and a last_entries file
+ * used in the computation of difference lists for doc_index and position
+ * list offsets, as well as the number of occurrences of terms.
*/
const POSITIONS_DOC_MAP_FOLDER = "positions_doc_maps";
/**
@@ -154,15 +163,17 @@ class IndexDocumentBundle implements CrawlConstants
*/
public $dictionary;
/**
- * Makes or initializes an IndexArchiveBundle with the provided parameters
+ * Makes or initializes an IndexDocumentBundle with the provided parameters
*
* @param string $dir_name folder name to store this bundle
* @param bool $read_only_archive whether to open archive only for reading
* or reading and writing
* @param string $description a text name/serialized info about this
- * IndexArchiveBundle
- * @param int $num_docs_per_partition the number of pages to be stored
- * in a single shard
+ * IndexDocumentBundle
+ * @param int $num_docs_per_partition the number of documents to be stored
+ * in a single partition
+ * @param int $max_keys the maximum number of keys used by the BPlusTree
+ * used for the inverted index
*/
public function __construct($dir_name, $read_only_archive = true,
$description = null, $num_docs_per_partition =
@@ -248,7 +259,6 @@ class IndexDocumentBundle implements CrawlConstants
}
/**
*
- * @param int $add_num_docs number of docs in the shard about to be added
* @param string $taking_too_long_touch a filename of a file to touch
* so its last modified time becomes the current time. In a typical
* Yioop crawl this is done for the crawl_status.txt file to prevent