<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the WordIterator class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UnitTest;
use seekquarry\yioop\library\index_bundle_iterators\WordIterator;

/**
 * Tests the functionality of the WordIterator class used to iterate over
 * documents in an IndexDocumentBundle containing a term.
 */
class WordIteratorTest extends UnitTest
{
    /**
     * Name of IndexDocumentBundle used to store test documents
     */
    const TEST_BUNDLE = "IndexData200000000";
    /**
     * Folder in which the test IndexDocumentBundle is created. It is
     * made by setUp() and removed again by tearDown()
     */
    const TEST_DIR = __DIR__ . '/test_files/word_iterator_test';
    /**
     * IndexDocumentBundle used to store test documents
     * @var object
     */
    public $index_archive;
    /**
     * Sets up the test directory for the document bundle and then
     * instantiates the IndexDocumentBundle stored in that directory
     */
    public function setUp()
    {
        $test_bundle = self::TEST_BUNDLE;
        if (!file_exists(self::TEST_DIR)) {
            // recursive so a missing parent test_files folder is not fatal
            mkdir(self::TEST_DIR, 0777, true);
        }
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR .
            "/$test_bundle", false, "TestBundle", 100, 101);
    }
    /**
     * Deletes test directory and test document bundle
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Sets up an IndexDocumentBundle storing 10 documents. The word 'be'
     * appears in every document and the words or0, or1, ..., or9 each
     * appear in exactly 1 document. The test then checks the number of
     * documents returned by the WordIterator's currentDocsWithWord method
     * to see if this is the case.
     */
    public function findTermTestCase()
    {
        IndexManager::clearCache();
        $index_archive = $this->index_archive;
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $docs[] = [
                CC::DOC_ID => $this->docidFromIntKeys($i, $i, $i),
                CC::SUMMARY => [
                    CC::DESCRIPTION => "to$i be or$i not$i to$i be...",
                    CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                    CC::TITLE => "Some$i Shakespeare$i Play$i",
                    CC::URL => "https://www.somewhere$i.com/"
                ],
                CC::PAGE => "Page $i",
            ];
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $this->assertEqual($this->countDocsWithTerm("be"), 10,
            "Number of occurrences of 'be' correct");
        for ($i = 0; $i < 10; $i++) {
            $this->assertEqual($this->countDocsWithTerm("or$i"), 1,
                "Number of occurrences of 'or$i' correct");
        }
    }
    /**
     * Auxiliary method that builds a WordIterator for a term over the test
     * bundle and counts the documents its currentDocsWithWord method
     * reports.
     *
     * @param string $term word to look up; it is passed through
     *      L\canonicalTerm before being given to the WordIterator
     * @return int number of documents returned by currentDocsWithWord, or
     *      0 if that method did not return an array
     */
    protected function countDocsWithTerm($term)
    {
        $word_iterator = new WordIterator(L\canonicalTerm($term),
            self::TEST_DIR . "/" . self::TEST_BUNDLE, true, null, 10);
        $docs_with_word = $word_iterator->currentDocsWithWord();
        return is_array($docs_with_word) ? count($docs_with_word) : 0;
    }
    /**
     * Auxiliary method used to create a document id for a test document.
     * The id is the concatenation of the url hash integer left-padded with
     * zeros to 8 characters, the page hash integer left-padded with zeros
     * to 8 characters, a single character "d" or "l", and the host hash
     * integer left-padded with zeros to 7 characters.
     *
     * @param int $i_hash_url an integer that is supposed to be used
     *      to correspond to the hash of a url component of a doc_id
     * @param int $j_hash_page an integer that is supposed to be used
     *      to correspond to the hash of a web page component of a doc_id
     * @param int $k_hash_host an integer that is supposed to be used
     *      to correspond to the hash of a url's host component of a doc_id
     * @param bool $is_doc whether to make a doc_id for a document ("d")
     *      or for a link to a document ("l")
     * @return string doc_id to associate with those integers
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}