<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the IndexManager class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UnitTest;

/**
 * Used to run unit tests for the IndexManager class. IndexManager acts as
 * a resource manager for the open indexes used to process a query.
 */
class IndexManagerTest extends UnitTest
{
    /**
     * Name of the included pre version3 Index bundle to test
     */
    const OLD_BUNDLE = "IndexData1421025145";
    /**
     * Name of the included version3 Index bundle to test
     */
    const NEW_BUNDLE = "IndexData200000000";
    /**
     * Prefix of folders for index manager test
     */
    const TEST_DIR = __DIR__ . '/test_files/index_manager_test';
    /**
     * Bundle created fresh in setUp() under TEST_DIR/NEW_BUNDLE; test
     * cases add documents to it. Declared explicitly so that assigning it
     * does not create a dynamic property (deprecated as of PHP 8.2).
     * @var IndexDocumentBundle
     */
    public $index_archive;
    /**
     * Before each test sets up a test directory with an old and new bundle in
     * it. The old bundle is unzipped from the examples folder if it is not
     * already present; the new bundle is created by instantiating an
     * IndexDocumentBundle.
     */
    public function setUp()
    {
        $old_bundle = self::OLD_BUNDLE;
        $new_bundle = self::NEW_BUNDLE;
        if (!file_exists(self::TEST_DIR)) {
            // recursive so a missing parent test_files folder is not fatal
            mkdir(self::TEST_DIR, 0777, true);
        }
        if (!file_exists(self::TEST_DIR . "/$old_bundle")) {
            $zip_archive = new \ZipArchive();
            $success = $zip_archive->open(C\BASE_DIR .
                "/examples/$old_bundle.zip");
            /* ZipArchive::open returns true on success, but a non-zero
               (hence truthy) int error code on failure, so the result must
               be compared strictly before extracting.
             */
            if ($success === true) {
                $zip_archive->extractTo(self::TEST_DIR);
                $zip_archive->close();
            }
        }
        /* NOTE(review): the trailing 3 and 5 look like deliberately small
           size limits so that the ten documents added in
           getWordInfoTestCase spread over several partitions -- confirm
           against the IndexDocumentBundle constructor.
         */
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR .
            "/$new_bundle", false, "TestBundle", 3, 5);
    }
    /**
     * Used after each test to delete the test directory
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Test case if IndexManager can determine the type and version of an index
     * to be able to return an instantiated version of that index.
     */
    public function getIndexTestCase()
    {
        $old_index = IndexManager::getIndex(self::TEST_DIR . "/" .
            self::OLD_BUNDLE);
        $this->assertTrue($old_index, "Get existing old bundle not null");
        $new_index = IndexManager::getIndex(self::TEST_DIR . "/" .
            self::NEW_BUNDLE);
        $this->assertTrue($new_index, "Get existing new bundle not null");
    }
    /**
     * Test case used to test whether or not the index manager can determine
     * the version of a Yioop index.
     */
    public function getVersionTestCase()
    {
        $version_old = IndexManager::getVersion(self::TEST_DIR . "/" .
            self::OLD_BUNDLE);
        $version_new = IndexManager::getVersion(self::TEST_DIR . "/" .
            self::NEW_BUNDLE);
        $this->assertEqual($version_old, "1", "Version 1 index detected");
        $this->assertEqual($version_new, "3.2", "Version 3.2 index detected");
    }
    /**
     * Tests if IndexManager can return the dictionary information about a
     * word stored in an index it manages. Ten artificial documents are added
     * to the new bundle, its dictionary is updated and the bundle is saved;
     * then the word info for 'be' is looked up and the postings of its first
     * row are read back and unpacked.
     */
    public function getWordInfoTestCase()
    {
        IndexManager::clearCache();
        $index_archive = $this->index_archive;
        $docs = [];
        /* $a is a description prefix: empty for documents 0-4 and
           "aha be " for documents 5-9 (it is changed only after document 4
           has been built).
         */
        $a = "";
        for ($i = 0; $i < 10; $i++) {
            $docs[] = [
                CC::DOC_ID => $this->docidFromIntKeys($i, $i, $i),
                CC::SUMMARY => [
                    CC::DESCRIPTION => "$a to$i be or$i not$i to$i be...",
                    CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                    CC::TITLE => "Some$i Shakespeare$i Play$i",
                    CC::URL => "https://www.somewhere$i.com/"
                ],
                CC::PAGE => "Page $i",
            ];
            if ($i > 3) {
                $a = "aha be ";
            }
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $be_info = IndexManager::getWordInfo(self::TEST_DIR . "/" .
            self::NEW_BUNDLE, L\canonicalTerm("be"));
        $this->assertEqual(count($be_info['ROWS']), 3,
            "The word 'be' occurs in three partitions");
        $this->assertTrue(is_array($be_info['ROWS'][2]['POSTINGS']),
            "Active partition postings has been returned as array.");
        $index = IndexManager::getIndex(self::TEST_DIR . "/" .
            self::NEW_BUNDLE);
        $postings = $index->getPostingsString(0,
            $be_info['ROWS'][0]['POSTINGS_OFFSET'],
            $be_info['ROWS'][0]['POSTINGS_LEN']);
        $posting_rows = $index->postings_tools->unpack($postings);
        $this->assertEqual(count($posting_rows), 4,
            "Able to look up postings for Partition 0 of 'be' Word Info.");
    }
    /**
     * Computes a 24 byte docId by padding an int to the left with 0's
     *
     * @param int $i integer to make docId from
     * @return string docid made by padding
     */
    protected function docidFromInt($i)
    {
        return str_pad("$i", 24, "0", STR_PAD_LEFT);
    }
    /**
     * docids are typically made from three 8 byte strings. This function
     * takes three ints, left pads each with '0' (\x30), and concatenates
     * them to make a 24 byte docid. The first byte of the third 8 byte group
     * (the 17th byte of the docid) says whether the id is for a document
     * ('d') or a link ('l'); this function uses the value of the $is_doc
     * flag to determine which of these to write there, so $k_hash_host is
     * only padded to the remaining 7 bytes.
     *
     * @param int $i_hash_url an int for first 8 bytes (in non-artificial docids
     *  would be for the crawlHash of url document from)
     * @param int $j_hash_page an int for second 8 bytes (in non-artificial
     *  docids would be for the crawlHash of document)
     * @param int $k_hash_host an int for the last 7 bytes, after the
     *  document/link marker (in non-artificial docids the last 8 bytes
     *  would be for the crawlHash of hostname of site document from)
     * @param bool $is_doc whether the hash is for a document or a link
     * @return string 24 byte docid.
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}