diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index cd266739f..8b2d955ce 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -599,8 +599,7 @@ class QueueServer implements CrawlConstants "running. Assume still running."); return; } - $filters = ($process == self::INDEXER) ? ["Indexer", "merg", - "index shard"] : ["Scheduler"]; + $filters = ($process == self::INDEXER) ? ["Indexer"] : ["Scheduler"]; $process_lines = L\lineFilter($lines, $filters); L\crawlLog("...Filtered " . $this->process_name . ".log lines"); $num_lines = count($process_lines); diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index e6df704fe..4c8997e92 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -222,7 +222,7 @@ class IndexDocumentBundle implements CrawlConstants */ public function addPages($pages, $visited_urls_count) { - crawlLog("Adding pages to document bundle..."); + crawlLog("Indexer adding pages to document bundle..."); $success = $this->documents->put($pages); $this->documents->addCount($visited_urls_count, "VISITED_URLS_COUNT"); @@ -231,7 +231,11 @@ class IndexDocumentBundle implements CrawlConstants /** * * @param int $add_num_docs number of docs in the shard about to be added - * @param string $taking_too_long_touch + * @param string $taking_too_long_touch a filename of a file to touch + * so its last modified time becomes the current time. In a typical + * Yioop crawl this is done for the crawl_status.txt file to prevent + * Yioop's web interface from stopping the crawl because it has seen + * no recent progress activity on a crawl. 
*/ public function updateDictionary($taking_too_long_touch = null) { @@ -270,9 +274,16 @@ class IndexDocumentBundle implements CrawlConstants return $advanced_partition; } /** + * Adds the previously constructed inverted index $partition to the inverted + * index of the whole bundle * - * @param int $partition; - * @param string $taking_too_long_touch + * @param int $partition which partitions inverted index to add, by + * default the current save partition + * @param string $taking_too_long_touch a filename of a file to touch + * so its last modified time becomes the current time. In a typical + * Yioop crawl this is done for the crawl_status.txt file to prevent + * Yioop's web interface from stopping the crawl because it has seen + * no recent progress activity on a crawl. */ public function addPartitionPostingsDictionary($partition = -1, $taking_too_long_touch = null) @@ -374,21 +385,26 @@ class IndexDocumentBundle implements CrawlConstants * Builds an inverted index shard for a documents PartitionDocumentBundle * partition. * @param int $partition to build index for - * @param string $taking_too_long_touch + * @param string $taking_too_long_touch a filename of a file to touch + * so its last modified time becomes the current time. In a typical + * Yioop crawl this is done for the crawl_status.txt file to prevent + * Yioop's web interface from stopping the crawl because it has seen + * no recent progress activity on a crawl. * @return mixed whether job executed to completion (true or false) if - * !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS, - * and TERM_STATISTICS (the latter having term frequency info) + * !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS, + * and TERM_STATISTICS (the latter having term frequency info) */ public function buildInvertedIndexPartition($partition = -1, $taking_too_long_touch = null, $just_stats = false) { $start_time = microtime(true); - crawlLog(" Start building inverted index ... Current Memory:". 
+ crawlLog(" Indexer start building inverted index ... Current Memory:". memory_get_usage()); if ($partition < 0) { $partition = $this->documents->parameters["SAVE_PARTITION"]; } - crawlLog("Building index inverted index for partition $partition"); + crawlLog( + "Indexer Building index inverted index for partition $partition"); $base_folder = $this->getPartitionBaseFolder($partition); if (!file_exists($base_folder)) { mkdir($base_folder); @@ -411,7 +427,7 @@ class IndexDocumentBundle implements CrawlConstants $postings = []; $last_entries = []; $positions = ""; - crawlLog("Preparing Index Map..."); + crawlLog("Indexer Preparing Index Map..."); $index_map = $this->prepareIndexMap($partition); crawlLog("Number of documents in mapped partition:" . count($index_map)); @@ -577,13 +593,13 @@ class IndexDocumentBundle implements CrawlConstants $word_lists, $meta_ids, $doc_map_index, $last_entries); $interim_elapse = changeInMicrotime($interim_time); if ($interim_elapse > 5) { - crawlLog("..Inverting " . $link_to . $site_url . + crawlLog("..Indexer Inverting " . $link_to . $site_url . "...took > 5s."); } - if (crawlTimeoutLog("..Still building inverted index. Have ". - "processed %s of %s documents.\n" . - "Total links or docs processed is %s.\n" . - "Last url processed was %s.", + if (crawlTimeoutLog("..Indexer Still building inverted index.". + "\n....Have processed %s of %s documents.\n" . + "....Total links or docs processed is %s.\n" . + "....Last url processed was %s.", $non_aux_doc_cnt, $num_partition, $non_aux_doc_cnt + $cnt, $link_to . $site_url) && $taking_too_long_touch) { if (file_exists($taking_too_long_touch)) { @@ -609,7 +625,7 @@ class IndexDocumentBundle implements CrawlConstants $postings_tools->save($postings_filename, $postings); $last_entries_tools->save($last_entries_filename, $last_entries); file_put_contents($positions_filename, $positions); - crawlLog(" Build inverted index time ". + crawlLog(" Indexer build inverted index time ". 
changeInMicrotime($start_time)); return true; } diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 8eb4f1c4f..5f357cf34 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -70,6 +70,12 @@ class IndexManager implements CrawlConstants public static function getIndex($index_name) { $index_name = trim($index_name); //trim to fix postgres quirkiness + if (!empty(self::$index_times[$index_name]) && + ($index_name != 'feed' || php_sapi_name() != 'cli' || + (time() - self::$index_times[$index_name]) + < C\MIN_QUERY_CACHE_TIME) ) { + return self::$indexes[$index_name]; + } $cache_dir = C\CRAWL_DIR . '/cache/'; $index_archive_full_path = $cache_dir . self::index_data_base_name . $index_name; @@ -109,74 +115,68 @@ class IndexManager implements CrawlConstants } else if (!$handled) { return false; } - if (empty(self::$indexes[$index_name]) || - (!empty(self::$index_times[$index_name]) && - ($index_name == 'feed' && php_sapi_name() == 'cli') && - (time() - self::$index_times[$index_name]) - > C\MIN_QUERY_CACHE_TIME) ) { - if (!isset($tmp)) { - $index_filename = $cache_dir . $index_archive_name; - if (file_exists($index_filename)) { - $is_old = (file_exists($index_filename . "/summaries")); - $bundle_class_name = ($is_old) ? C\NS_LIB . - "IndexArchiveBundle" : C\NS_LIB . "IndexDocumentBundle"; - $tmp = new $bundle_class_name($cache_dir . - $index_archive_name); - if (!$tmp) { - return false; - } - } else { - $tmp = false; - $use_name = $index_name; - $serve_archive = -1; - if (preg_match("/\-\d$/", $index_name)) { - $serve_archive = substr($index_name, -1); - $use_name = substr($index_name, 0, -2); - } - $index_archive_name = self::double_index_base_name . - $use_name; - $status_file = $cache_dir . $index_archive_name . - "/status.txt"; - if ($serve_archive < 0 && file_exists($status_file)) { - $status = unserialize(file_get_contents($status_file)); - $active_archive = (empty($status["swap_count"])) ? 
1 : - $status["swap_count"] % 2; - $serve_archive = 1 - $active_archive; - } - $is_old = (file_exists($index_filename . - "/bundle0/summaries") || - file_exists($index_filename . "/bundle1/summaries")); - $bundle_class_name = ($is_old) ? - C\NS_LIB . "IndexArchiveBundle" - : C\NS_LIB . "IndexDocumentBundle"; + if (!isset($tmp)) { + $index_filename = $cache_dir . $index_archive_name; + if (file_exists($index_filename)) { + $is_old = (file_exists($index_filename . "/summaries")); + $bundle_class_name = ($is_old) ? C\NS_LIB . + "IndexArchiveBundle" : C\NS_LIB . "IndexDocumentBundle"; + $tmp = new $bundle_class_name($cache_dir . + $index_archive_name); + if (!$tmp) { + return false; + } + } else { + $tmp = false; + $use_name = $index_name; + $serve_archive = -1; + if (preg_match("/\-\d$/", $index_name)) { + $serve_archive = substr($index_name, -1); + $use_name = substr($index_name, 0, -2); + } + $index_archive_name = self::double_index_base_name . + $use_name; + $status_file = $cache_dir . $index_archive_name . + "/status.txt"; + if ($serve_archive < 0 && file_exists($status_file)) { + $status = unserialize(file_get_contents($status_file)); + $active_archive = (empty($status["swap_count"])) ? 1 : + $status["swap_count"] % 2; + $serve_archive = 1 - $active_archive; + } + $is_old = (file_exists($index_filename . + "/bundle0/summaries") || + file_exists($index_filename . "/bundle1/summaries")); + $bundle_class_name = ($is_old) ? + C\NS_LIB . "IndexArchiveBundle" + : C\NS_LIB . "IndexDocumentBundle"; + $tmp = new $bundle_class_name($cache_dir . + $index_archive_name . "/bundle$serve_archive"); + if (!$tmp) { + $serve_archive = ($serve_archive == 0) ? 1 : 0; $tmp = new $bundle_class_name($cache_dir . $index_archive_name . "/bundle$serve_archive"); - if (!$tmp) { - $serve_archive = ($serve_archive == 0) ? 1 : 0; - $tmp = new $bundle_class_name($cache_dir . - $index_archive_name . 
"/bundle$serve_archive"); - } - if (!$tmp) { - return false; - } + } + if (!$tmp) { + return false; } } - self::$indexes[$index_name] = $tmp; - if ($is_old) { - self::$indexes[$index_name]->setCurrentShard(0, true); - } - self::$index_times[$index_name] = time(); - /* - If too many cached discard oldest 1/3 of cached indices - */ - if (count(self::$indexes) > self::INDEX_CACHE_SIZE) { - $times = array_values(self::$index_times); - sort($times); - $oldest_third = $times[floor(count($times)/3)]; - foreach (self::$index_times as $name => $time) { - if ($time <= $oldest_third) { - unset(self::$index_times[$name], self::$indexes[$name]); - } + } + self::$indexes[$index_name] = $tmp; + if ($is_old) { + self::$indexes[$index_name]->setCurrentShard(0, true); + } + self::$index_times[$index_name] = time(); + /* + If too many cached discard oldest 1/3 of cached indices + */ + if (count(self::$indexes) > self::INDEX_CACHE_SIZE) { + $times = array_values(self::$index_times); + sort($times); + $oldest_third = $times[floor(count($times)/3)]; + foreach (self::$index_times as $name => $time) { + if ($time <= $oldest_third) { + unset(self::$index_times[$name], self::$indexes[$name]); } } } diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index c17d4735e..e6368e564 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -180,7 +180,8 @@ class DocIterator extends IndexBundleIterator $doc_map_filename = $base_folder . "/" . IndexDocumentBundle::DOC_MAP_FILENAME; $doc_map_tools = $index->doc_map_tools; - $this->doc_map = $doc_map_tools->load($doc_map_filename); + $this->doc_map = $doc_map_tools->load($doc_map_filename) + ?? 
[]; $doc_keys = array_keys($this->doc_map); $key_index = []; foreach ($this->doc_map as $key => $entry) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 87c67cb53..91a72366b 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -732,7 +732,7 @@ class WordIterator extends IndexBundleIterator $this->dictionary_info[$this->generation_pointer]; $this->current_generation = $partition_info['PARTITION']; $this->start_offset = 0; - $this->last_offset = $partition_info['NUM_DOCS'] - 1; + $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1; } $this->current_offset = ($is_ascending) ? $this->start_offset: $this->last_offset; diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index 254ef4148..9ea2e35af 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -315,14 +315,31 @@ use seekquarry\yioop\library\UnitTest; } } /** + * Computes a 24 byte docId by padding an int to the left with 0's * + * @param int $i integer to make docId from + * @return string docid made by padding */ protected function docidFromInt($i) { return str_pad("$i", 24, "0", STR_PAD_LEFT); } /** + * docids are typically made from three 8byte strings. This function + * takes three ints and left pads each with '0' (\x30) and concatenates + * them to make a 24 byte docid. As docids use their 8th byte to say whether + * the id is for a document (replace with 'd') or a link (replace with 'l') + * this function uses the value of the $is_doc flag to determine which value + * to overwrite the 8th byte with. 
* * @param int $i_hash_url an int for first 8 bytes (in non-artificial docids + * would be for the crawlHash of url document from) + * @param int $j_hash_page an int for second 8 bytes (in non-artificial + * docids would be for the crawlHash of document) + * @param int $k_hash_host an int for last 8 bytes (in non-artificial + * docids would be for the crawlHash of hostname of site document from) + * @param bool $is_doc whether the hash is for a document or a link + * @return string 24 byte docid. */ protected function docidFromIntKeys($i_hash_url, $j_hash_page, $k_hash_host, $is_doc = true) diff --git a/tests/IndexManagerTest.php b/tests/IndexManagerTest.php index 33caf7312..138bd2acf 100644 --- a/tests/IndexManagerTest.php +++ b/tests/IndexManagerTest.php @@ -41,7 +41,8 @@ use seekquarry\yioop\library\IndexManager; use seekquarry\yioop\library\UnitTest; /** - * + * Used to run unit tests for the IndexManager class. IndexManager acts as + * a resource manager for the open indexes used to process a query. */ class IndexManagerTest extends UnitTest { @@ -57,10 +58,9 @@ use seekquarry\yioop\library\UnitTest; * Prefix of folders for index manager test */ const TEST_DIR = __DIR__ . '/test_files/index_manager_test'; - /** - * Sets up an array to keep track of what linear hash tables we've made - * so that we can delete them when done a test. + * Before each test sets up a test directory with an old and new bundle in + * it */ public function setUp() { @@ -81,7 +81,7 @@ use seekquarry\yioop\library\UnitTest; "/$new_bundle", false, "TestBundle", 3, 5); } /** - * + * Used after each test to delete the test directory */ public function tearDown() { @@ -89,7 +89,8 @@ use seekquarry\yioop\library\UnitTest; $model->db->unlinkRecursive(self::TEST_DIR); } /** - * + * Test case if IndexManager can determine the type and version of an index + * to be able to return an instantiated version of that index. 
*/ public function getIndexTestCase() { @@ -101,7 +102,8 @@ use seekquarry\yioop\library\UnitTest; $this->assertTrue($new_index, "Get existing new bundle not null"); } /** - * + * Test case used to test whether or not the index manager can determine + * the version of a Yioop index. */ public function getVersionTestCase() { @@ -113,7 +115,8 @@ use seekquarry\yioop\library\UnitTest; $this->assertEqual($version_new, 3, "Version 3 index detected"); } /** - * + * Tests if IndexManager can return the dictionary information about a + * word stored in an index it manages. */ public function getWordInfoTestCase() { @@ -166,7 +169,21 @@ use seekquarry\yioop\library\UnitTest; return str_pad("$i", 24, "0", STR_PAD_LEFT); } /** + * docids are typically made from three 8byte strings. This function + * takes three ints and left pads each with '0' (\x30) and concatenates + * them to make a 24 byte docid. As docids use their 8th byte to say whether + * the id is for a document (replace with 'd') or a link (replace with 'l') + * this function uses the value of the $is_doc flag to determine which value + * to overwrite the 8th byte with. * + * @param int $i_hash_url an int for first 8 bytes (in non-artificial docids + * would be for the crawlHash of url document from) + * @param int $j_hash_page an int for second 8 bytes (in non-artificial + * docids would be for the crawlHash of document) + * @param int $k_hash_host an int for last 8 bytes (in non-artificial + * docids would be for the crawlHash of hostname of site document from) + * @param bool $is_doc whether the hash is for a document or a link + * @return string 24 byte docid. */ protected function docidFromIntKeys($i_hash_url, $j_hash_page, $k_hash_host, $is_doc = true)