Last commit for tests/WordIteratorTest.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the WordIterator class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UnitTest;
use seekquarry\yioop\library\index_bundle_iterators\WordIterator;

/**
 * Tests the functionality of the WordIterator class used to iterate over
 * documents in an IndexDocumentBundle containing a term.
 */
class WordIteratorTest extends UnitTest
{
    /**
     * Name of IndexDocumentBundle used to store test documents
     */
    const TEST_BUNDLE = "IndexData200000000";
    /**
     * Folder in which the test IndexDocumentBundle is created
     */
    const TEST_DIR = __DIR__ . '/test_files/word_iterator_test';
    /**
     * IndexDocumentBundle used to store test documents
     * @var object
     */
    public $index_archive;
    /**
     * Sets up the test directory for the document bundle and then
     * instantiates the IndexDocumentBundle stored in it
     */
    public function setUp()
    {
        if (!file_exists(self::TEST_DIR)) {
            /* recursive so a missing parent test_files folder does not make
               the directory creation (and so the whole test) fail
             */
            mkdir(self::TEST_DIR, 0777, true);
        }
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR .
            "/" . self::TEST_BUNDLE, false, "TestBundle", 100, 101);
    }
    /**
     * Deletes test directory and test document bundle
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Sets up an IndexDocumentBundle storing 10 documents. The word 'be'
     * appears in every document and the words or0, or1, ..., or9 each
     * appear in exactly 1 document. The test then checks that the number of
     * documents returned by WordIterator's currentDocsWithWord method
     * matches these counts.
     */
    public function findTermTestCase()
    {
        IndexManager::clearCache();
        $index_archive = $this->index_archive;
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $docs[] = [
                CC::DOC_ID => $this->docidFromIntKeys($i, $i, $i),
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION => "to$i be or$i not$i to$i be...",
                        CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "Some$i Shakespeare$i Play$i",
                        CC::URL => "https://www.somewhere$i.com/"
                    ],
                CC::PAGE => "Page $i",
            ];
        }
        $index_archive->addPages($docs, count($docs));
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $this->assertEqual($this->numDocsWithWord("be"), 10,
            "Number of occurrences of 'be' correct");
        for ($i = 0; $i < 10; $i++) {
            $this->assertEqual($this->numDocsWithWord("or$i"), 1,
                "Number of occurrences of 'or$i' correct");
        }
    }
    /**
     * Auxiliary method that makes a WordIterator for a word over the test
     * bundle and counts the documents its currentDocsWithWord method gives
     * back
     *
     * @param string $word term to look up; it is passed through
     *      L\canonicalTerm before being handed to the WordIterator
     * @return int number of entries returned by currentDocsWithWord, or 0
     *      if that method did not return an array
     */
    protected function numDocsWithWord($word)
    {
        $word_iterator = new WordIterator(L\canonicalTerm($word),
            self::TEST_DIR . "/" . self::TEST_BUNDLE, true, null, 10);
        $docs_with_word = $word_iterator->currentDocsWithWord();
        return is_array($docs_with_word) ? count($docs_with_word) : 0;
    }
    /**
     * Auxiliary method used to create a document id for a test document.
     * Each integer is left padded with zeros (to 8, 8 and 7 characters
     * respectively) and a 'd' or 'l' marker is placed before the host
     * component.
     *
     * @param int $i_hash_url an integer that is supposed to be used
     *      to correspond to the hash of a url component of a doc_id
     * @param int $j_hash_page an integer that is supposed to be used
     *      to correspond to the hash of a web page component of a doc_id
     * @param int $k_hash_host an integer that is supposed to be used
     *      to correspond to the hash of a url's host component of a doc_id
     * @param bool $is_doc true to make a doc_id for a document ('d'
     *      marker), false to make one for a link item ('l' marker)
     * @return string doc_id to associate with those integers
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}
ViewGit