diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 36a6fb081..472555beb 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -601,11 +601,15 @@ class QueueServer implements CrawlConstants return; } $filters = ($process == self::INDEXER) ? ["Indexer"] : ["Scheduler"]; - $process_lines = L\lineFilter($lines, $filters); - L\crawlLog("...Filtered " . $this->process_name . ".log lines"); + $initial = ($process == self::INDEXER) ? "I" : "S"; + $process_lines = L\lineFilter($lines, $filters, false); $num_lines = count($process_lines); - $last_process_timestamp = $time; + L\crawlLog("...Filtered " . $this->process_name . ".log lines ". + "looking for $initial process. Found $num_lines associated with ". + "process."); // err on the side of caution in assuming process dead + $last_process_timestamp = (!empty($lines[0])) ? + L\logLineTimestamp($process_lines[$num_lines - 1]) : $time; if (isset($process_lines[$num_lines - 1])) { $timestamp = L\logLineTimestamp($process_lines[$num_lines - 1]); @@ -1984,7 +1988,7 @@ class QueueServer implements CrawlConstants $etag_expires_data = unserialize(gzuncompress(L\webdecode(file_get_contents($file)))); L\crawlLog("Scheduler Done uncompressing etag data.". - " Starting to add to btree"); + " Starting to add to linear hash table"); $num_entries = count($etag_expires_data); $i = 0; foreach ($etag_expires_data as $data) { diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php index 4fa6ec0c8..616d96eb3 100644 --- a/src/library/BPlusTree.php +++ b/src/library/BPlusTree.php @@ -59,15 +59,21 @@ class BPlusTree "MAX_KEYS" => self::MAX_KEYS ]; /** - * + * Internal nodes of BPlusTree are folders, subfolders/subfiles are + * named according to their least key except for the first subfolder/ + * subfile of the node which is given the name of the LEAST_NODE_NAME + * constant */ const LEAST_NODE_NAME = "start"; /** - * + * Internal nodes of BPlusTree are folders. 
For nodes of the same + * height in the tree NEXT_NODE_NAME is used as the name of the file + * with the serialized name of the next folder of the same height in + * the tree. */ const NEXT_NODE_NAME = "next"; /** - * + * Name of temporary file used when splitting a BPlusTree node. */ const TEMP_NODE_NAME = "tmp_node"; /** diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index 25af4ef5b..d5f7ea2e6 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -41,44 +41,62 @@ require_once __DIR__ . '/Utility.php'; * from these documents which allow one to search for documents containing a * particular word. * - * - * * @author Chris Pollett */ class IndexDocumentBundle implements CrawlConstants { /** - * + * File name used to store within the folder of the IndexDocumentBundle + * parameter/configuration information about the bundle */ const ARCHIVE_INFO_FILE = "archive_info.txt"; /** - * + * The version of this IndexDocumentBundle. The lowest format number is + * 3.0 as prior inverted index/document stores used IndexArchiveBundles */ const DEFAULT_VERSION = "3.0"; /** - * + * Default values for the configuration parameters of an + * IndexDocumentBundle */ const DEFAULT_PARAMETERS = ["DESCRIPTION" => "", "VERSION" => self::DEFAULT_VERSION ]; /** - * + * Subfolder of IndexDocumentBundle to store the btree with + * term => posting list information (i.e., the inverted index) */ const DICTIONARY_FOLDER = "dictionary"; /** - * + * DocIds are made of three parts: hash of url, hash of document, hash + * of url hostname. Each of these hashes is DOCID_PART_LEN long */ const DOCID_PART_LEN = 8; /** - * + * Length of DocIds used by this IndexDocumentBundle */ const DOCID_LEN = 24; /** - * + * Partition i in an IndexDocumentBundle has a subfolder i + * within self::POSITIONS_DOC_MAP_FOLDER. 
Within this subfolder i, + * self::DOC_MAP_FILENAME is the name of the file used to store the + * document map for the partition. The document map consists of a sequence + * of records associated with each doc_id of a document stored in the + * partition. The first record is ["POS" => $num_words, + * "SCORE" => floatval($global_score_for_document)]. The second record is: + * ["POS" => $length_of_title_of_document, "SCORE" => + * floatval($num_description_scores)]. + * Here a description score is a score for the importance of a section + * of a document. Subsequent records list [POS => the length of the jth + * section of the document, SCORE => its score]. */ const DOC_MAP_FILENAME = "doc_map"; /** - * + * Folder used to store the partition data of this IndexDocumentBundle. + * These will consist of .txt.gz files for each partition which are used + * to store summaries of documents and actual documents (web pages) and + * .ix files which are used to store doc_id and the associated offsets to + * their summary and actual document within the .txt.gz file */ const DOCUMENTS_FOLDER = "documents"; /** @@ -319,8 +337,8 @@ class IndexDocumentBundle implements CrawlConstants $num_postings = count($postings); $i = 0; foreach ($postings as $term => $entry) { - if(crawlTimeoutLog("..Still processing partition $partition. Have ". - "completed $i postings of $num_postings.") && + if(crawlTimeoutLog("..Indexer Still processing partition ". + "$partition. 
Have completed $i postings of $num_postings.") && $taking_too_long_touch) { if (file_exists($taking_too_long_touch)) { touch($taking_too_long_touch, time()); diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php index 2271d3397..8c7c6bf96 100644 --- a/src/library/PartitionDocumentBundle.php +++ b/src/library/PartitionDocumentBundle.php @@ -290,7 +290,16 @@ class PartitionDocumentBundle return $out_data; } /** + * Retrieves a BLOB string in the file $archive_filename at byte position + * $offset of length $len. It uncompresses this string using + * $compressor->uncompress and returns the result. * + * @param string $archive_filename the filename of a partition archive + * file to get a blob object from + * @param int $offset a byte position in that file + * @param int $len number of bytes from $offset to read. + * @return string the result of uncompressing the string at $offset of + * length $len */ public function getArchive($archive_filename, $offset, $len) { @@ -316,11 +325,11 @@ class PartitionDocumentBundle return $value; } /** - * Returns the path of the archive file (used to store BLOB and SERIAL + * Returns the path to the archive file (used to store BLOB and SERIAL * columns) for the $i partition in this PartitionDocumentBundle * * @param int $i partition to get the archive file name for - * @return string path of $i partition + * @return string path of $i partition archive file */ public function getPartition($i) { @@ -328,7 +337,12 @@ class PartitionDocumentBundle $i . $this->compressor->fileExtension(); } /** + * Returns the path to the index file (used to store all columns of + * a partition record except blob and serial columns) for the $i partition + * in this PartitionDocumentBundle * + * @param int $i partition to get the index file name for + * @return string path of $i partition index file */ public function getPartitionIndex($i) { @@ -336,7 +350,15 @@ class PartitionDocumentBundle $i . 
self::INDEX_EXTENSION; } /** + * Returns the unserialized index file for the $partition partition of + * this PartitionDocumentBundle. If $force_load is set to true then reloads + * from disk rather than use a cached value if present. * + * @param int $partition which partition index to read + * @param bool $force_load whether to reload the index from disk or to + * use a cached value if present + * @return array $key => packed records pairs where records are + * packed according to this PartitionDocumentBundle's signature */ public function loadPartitionIndex($partition, $force_load = false) { @@ -379,6 +401,7 @@ class PartitionDocumentBundle * * @param array $row_or_rows either array of record with fields given * by this PartitionDocumentBundle's signature or an array of rows. + * @return bool success or not */ public function put($row_or_rows) { @@ -513,7 +536,7 @@ class PartitionDocumentBundle $this->saveParameters(); } /** - * + * */ public function saveParameters() { diff --git a/src/library/Utility.php b/src/library/Utility.php index bfd3f9eb8..14903a903 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -2336,17 +2336,20 @@ function tail($file_name, $num_lines) * @param string $lines to search * @param mixed $filters either string to filter lines with or an array of * strings (any of which can be present to pass the filter) + * @param bool $case_insensitive whether search should be done case + * insensitively or not. * @return array lines containing the string */ -function lineFilter($lines, $filters) +function lineFilter($lines, $filters, $case_insensitive = true) { $out_lines = []; + $search_function = ($case_insensitive) ? 
"stripos" : "strpos"; if (is_string($filters)) { $filters = [$filters]; } foreach ($lines as $line) { foreach ($filters as $filter) { - if (stripos($line, $filter) !== false) { + if ($search_function($line, $filter) !== false) { $out_lines[] = $line; break; } diff --git a/src/views/helpers/OptionsHelper.php b/src/views/helpers/OptionsHelper.php index d0f8f330d..143cf91cc 100755 --- a/src/views/helpers/OptionsHelper.php +++ b/src/views/helpers/OptionsHelper.php @@ -132,6 +132,9 @@ class OptionsHelper extends Helper * or as an unordered list. * @param string $class_list a string of additional CSS classes for outer- * most div tag + * @param string $show_top if empty then the selected item value will be the + * clickable link at the top of the drop down, if non-empty + * then the text of $show_top will be used. */ public function renderLinkDropDown($id, $options, $selected, $url_prefix, $as_list = false, $class_list = "", $show_top = "") diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index 9ea2e35af..0e74125f2 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -82,7 +82,8 @@ use seekquarry\yioop\library\UnitTest; $this->assertEqual($archive_info["DESCRIPTION"], "TestBundle"); } /** - * + * Tests that after adding pages to an IndexArchiveBundle, the page, + * and its summary can be retrieved. */ public function addGetPagesTestCase() { diff --git a/tests/LinearHashTableTest.php b/tests/LinearHashTableTest.php index 97470b1b5..bc9c63e75 100644 --- a/tests/LinearHashTableTest.php +++ b/tests/LinearHashTableTest.php @@ -38,7 +38,10 @@ use seekquarry\yioop\models\Model; use seekquarry\yioop\library\UnitTest; /** + * Used to test that the LinearHashTable class properly stores key value pairs, + * handles insert, deletes, retrievals okay. 
* + * @author Chris Pollett */ class LinearHashTableTest extends UnitTest { @@ -81,7 +84,9 @@ use seekquarry\yioop\library\UnitTest; $this->table_dirs = []; } /** - * + * This tests that packed records can be successfully unpacked + * after being put into the linear hash table and retrieved. + * This tests the LinearHashTable use-case for storing ETag data */ public function packUnpackFormatTestCase() { @@ -133,7 +138,8 @@ use seekquarry\yioop\library\UnitTest; } } /** - * + * Tests whether key value pairs inserted into the linear hash table + * can subsequently be retrieved. This tests the hashed key case */ public function insertHashKeyLookupTestCase() { @@ -154,6 +160,8 @@ use seekquarry\yioop\library\UnitTest; } } /** + * Tests whether key value pairs inserted into the linear hash table + * can subsequently be retrieved. This tests the non-hashed key case */ public function insertKeyLookupTestCase() {