diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 472555beb..25dc5beae 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -605,8 +605,8 @@ class QueueServer implements CrawlConstants $process_lines = L\lineFilter($lines, $filters, false); $num_lines = count($process_lines); L\crawlLog("...Filtered " . $this->process_name . ".log lines ". - "looking for $initial process. Found $num_lines associated with ". - "process."); + "looking for $initial process. Found $num_lines lines associated ". + "with process."); // err on the side of caution in assuming process dead $last_process_timestamp = (!empty($lines[0])) ? L\logLineTimestamp($process_lines[$num_lines - 1]) : $time; @@ -807,6 +807,7 @@ class QueueServer implements CrawlConstants if ($this->isOnlyIndexer()) { return; } + //so isScheduler true here $this->processRobotUrls(); if (C\USE_ETAG_EXPIRES) { $this->processEtagExpires(); diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index d5f7ea2e6..92105d3ae 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -736,8 +736,9 @@ class IndexDocumentBundle implements CrawlConstants } } /** - * + * Checks if a doc_id corresponds to a document or a link * @param string $key + * @return bool true if a document */ public function isDoc($key) { diff --git a/src/library/LinearHashTable.php b/src/library/LinearHashTable.php index 7e793f703..80d309629 100644 --- a/src/library/LinearHashTable.php +++ b/src/library/LinearHashTable.php @@ -342,7 +342,13 @@ class LinearHashTable self::HASH_INDEX_EXTENSION; } /** - * + * Checks if a key exists in the linear hash table + * @param string $key key to check + * @param bool $compute_hash whether the key has already had the linear + * hash table's hash function applied or not + * @param bool $check_active whether to check the active index of + * buffered key values that have not yet been put into
the main table + * @return bool whether the $key exists in the linear hash table */ public function exists($key, $compute_hash = true, $check_active = true) { @@ -373,6 +379,9 @@ class LinearHashTable } /** * + * @param array $row_or_rows + * @param bool $is_hash_key + * @param bool $allow_duplicates */ public function put($row_or_rows, $is_hash_key = false, $allow_duplicates = true) diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php index 8c7c6bf96..0a3b1200e 100644 --- a/src/library/PartitionDocumentBundle.php +++ b/src/library/PartitionDocumentBundle.php @@ -100,55 +100,87 @@ class PartitionDocumentBundle */ const PARTITION_SIZE_THRESHOLD = 2147483648; /** - * + * Used to store the file handle to, the partition number, and last add time + * for the last time an item's blob/serial columns were added to for + * the PartitionDocumentBundle + * @var array */ public $add_archive_cache = [null, "", -1]; /** - * + * Used to store the file handle to, the partition number, and last access + * time for the last time an item's blob/serial columns were accessed for + * the PartitionDocumentBundle + * @var array */ public $get_archive_cache = [null, "", -1]; /** - * + * Array of column names for the columns in a PartitionDocumentBundle which + * are of type BLOB or SERIAL + * @var array */ public $blob_columns; /** - * + * The seekquarry\yioop\library\compressors\Compressor object used to + * compress record files and blob items. + * @var object */ public $compressor; /** - * + * Folder path where the PartitionDocumentBundle is stored + * @var string */ public $folder; /** - * + * In memory cache of partitions from the PartitionDocumentBundle + * @var array */ public $index_cache; /** - * + * Maximum number of items the partition cache is allowed to hold */ public $index_cache_size; /** - * + * hi-res time the file handle for add or getting filehandle blob was + * created. 
Used to determine if the file_handle needs to be closed, because + * active partition changed + * @var int */ public $instance_time; /** - * + * Name of primary key column for records + * @var string */ public $key_field; /** - * + * Stores the constructor parameters used to create this + * PartitionDocumentBundle + * @var array */ public $parameters; /** - * + * Array of column names for the columns in a PartitionDocumentBundle which + * are of type SERIAL + * @var array */ public $serial_columns; /** - * + * The PackedTableTools object used to pack and unpack records in + * partitions + * @var object */ public $table_tools; /** + * Used to create a new instance of a PartitionDocumentBundle * + * @param string $folder the path to the folder to store this + * PartitionDocumentBundle + * @param array $format the column names, keys and types for this + * PartitionDocumentBundle object + * @param int $partition_size_threshold maximum number of items to store + * in a partition before making the next partition + * @param object $compressor_type + * seekquarry\yioop\library\compressors\Compressor object used to + * compress record files and blob items. */ public function __construct($folder, $format = self::DEFAULT_PARAMETERS["FORMAT"], @@ -351,7 +383,7 @@ class PartitionDocumentBundle } /** * Returns the unserialized index file for the $partition parition of - * this PartitionIndexBundle. If $force_load is set to true then reloads + * this PartitionDocumentBundle. If $force_load is set to true then reloads * from disk rather than use a cached value if present. * * @param int $partition which partition index to read @@ -489,8 +521,12 @@ class PartitionDocumentBundle return true; } /** + * Saves the current save partition, adds one to the save partition number, + * and starts a new save partition. * - * @param int $new_save_partition + * @param int $new_save_partition partition and add one to.
If the default is used, + * then this method will use the parameters "SAVE_PARTITION" + * value. */ public function advanceSavePartition($new_save_partition = 0) { @@ -536,7 +572,7 @@ class PartitionDocumentBundle $this->saveParameters(); } /** - * + * Save the operating parameters of this PartitionDocumentBundle */ public function saveParameters() { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 91a72366b..7ae376745 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -487,7 +487,7 @@ class WordIterator extends IndexBundleIterator if (empty($descriptions_scores)) { return count($positions); } - $first_score = $descriptions_scores[0]['SCORE']; + $first_score = $descriptions_scores[0]['SCORE'] ?? 1; $description_pos = $descriptions_scores[$description_index]['POS']; $num_scores = count($descriptions_scores); $raw_freq_squared = 0; diff --git a/tests/BPlusTreeTest.php b/tests/BPlusTreeTest.php index a837a715f..ae210cc1b 100644 --- a/tests/BPlusTreeTest.php +++ b/tests/BPlusTreeTest.php @@ -86,9 +86,10 @@ use seekquarry\yioop\library\UnitTest; return new L\BPlusTree($table_dir, $format, $max_degree); } /** - * + * Test putting items in bplustrees of odd sized nodes between 3 and 13 and + * then seeing if the items can be retrieved */ - public function getPutTestCase() + public function putGetTestCase() { for ($i = 3; $i <= 13; $i += 2) { $bptree = $this->createTree($i); diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index 0e74125f2..2e67f3f0c 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -41,7 +41,11 @@ use seekquarry\yioop\library\UrlParser; use seekquarry\yioop\library\UnitTest; /** - * + * Used to test that the IndexDocumentBundle class can properly add and + * retrieve documents.
Checks its prepareIndexMap method correctly deduplicates + * documents before inverted index creation. Tests inverted index creation + * and adding terms to IndexDocumentBundle's BPlusTree. Checks look up of + * documents according to term. */ class IndexDocumentBundleTest extends UnitTest { @@ -50,7 +54,8 @@ use seekquarry\yioop\library\UnitTest; */ const TEST_DIR = __DIR__ . '/test_files/index_document_test'; /** - * + * Holds the IndexDocumentBundle used for test purposes + * @var IndexDocumentBundle */ public $index_archive; /** @@ -74,7 +79,8 @@ use seekquarry\yioop\library\UnitTest; $model->db->unlinkRecursive(self::TEST_DIR); } /** - * + * Checks if the constructor of the IndexDocumentBundle correctly saves + * the constructor info such as the bundle description */ public function saveDescriptionTestCase() { @@ -109,7 +115,10 @@ use seekquarry\yioop\library\UnitTest; } } /** - * + * Tests the prepareIndexMap method which is used to deduplicate pages + * before an inverted index of a partition is made. Tests adding + * pages with the same doc_id to make sure they get grouped together. + * Grouping also affects how documents are scored so tests this as well. */ public function prepareIndexTestCase() { @@ -177,7 +186,10 @@ use seekquarry\yioop\library\UnitTest; } } /** - * + * Tests the process of adding documents to the IndexDocumentBundle, then + * building an inverted index from this. To check, after the above is + * done, perform lookups of terms known to have posting lists + * and then check the properties of the returned posting lists. */ public function buildInvertedIndexPartitionTestCase() { @@ -248,7 +260,11 @@ use seekquarry\yioop\library\UnitTest; "Test Position List Decode"); } /** - * + * Tests the complete process of going from documents, dedup, + * building an inverted index and adding the result to the + * IndexDocumentBundle's inverted index.
To check this, after the above is + * done, perform lookups of terms known to be in the indexed documents + * and check the properties of the returned posting lists. */ public function addPartitionPostingsDictionaryTestCase() {