Last commit for tests/IndexManagerTest.php: afd6930f42e31d81a53d42061b5fd758f56c62de

First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures

Chris Pollett [2024-01-15 02:Jan:th]
First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures
Folder structure of IndexDocumentBundles also modified and now supports overflow folder (which
could be on a different hard drive). ArcTool has been updated to support migration to new
indexes
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the IndexManager class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UnitTest;

/**
 * Used to run unit tests for the IndexManager class. IndexManager acts as
 * a resource manager for the open indexes used to process a query.
 */
 class IndexManagerTest extends UnitTest
{
    /**
     * Name of the included pre version3 Index bundle to test. No test case
     * in this class currently references it.
     */
    const OLD_BUNDLE = "IndexData1421025145";
    /**
     * Name of the version3 Index bundle that setUp() creates under TEST_DIR
     * and that every test case looks up through IndexManager
     */
    const NEW_BUNDLE = "IndexData200000000";
    /**
     * Folder under which the bundles for the index manager tests are created
     */
    const TEST_DIR = __DIR__ . '/test_files/index_manager_test';
    /**
     * Bundle instantiated by setUp() before each test case. Declared
     * explicitly so that assigning it does not create a dynamic property
     * (deprecated as of PHP 8.2).
     * @var IndexDocumentBundle
     */
    public $index_archive;
    /**
     * Before each test makes sure the test directory exists and creates a
     * fresh NEW_BUNDLE index bundle inside of it
     */
    public function setUp()
    {
        if (!file_exists(self::TEST_DIR)) {
            // recursive, so a missing parent folder does not make this fail
            mkdir(self::TEST_DIR, 0777, true);
        }
        // NOTE(review): the trailing arguments are presumably the read-only
        // flag, bundle description, and two size limits -- confirm against
        // the IndexDocumentBundle constructor
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR .
            "/" . self::NEW_BUNDLE, false, "TestBundle", 3, 5);
    }
    /**
     * Used after each test to delete the test directory
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Test case if IndexManager can determine the type and version of an index
     * to be able to return an instantiated version of that index.
     */
    public function getIndexTestCase()
    {
        $new_index = IndexManager::getIndex(self::TEST_DIR . "/".
            self::NEW_BUNDLE);
        $this->assertTrue($new_index, "Get existing new bundle not null");
    }
    /**
     * Test case used to test whether or not the index manager can determine
     * the version of a Yioop index.
     */
    public function getVersionTestCase()
    {
        $version_new = IndexManager::getVersion(self::TEST_DIR . "/".
            self::NEW_BUNDLE);
        $this->assertEqual($version_new, "3.2", "Version 3.2 index detected");
    }
    /**
     * Tests if IndexManager can return the dictionary information about a
     * word stored in an index it manages. Ten artificial documents, each of
     * whose descriptions contains the word "be", are added to the bundle
     * made in setUp(); the word info for "be" is then looked up and the
     * postings of its first row are unpacked and counted.
     */
    public function getWordInfoTestCase()
    {
        IndexManager::clearCache();
        $index_archive = $this->index_archive;
        $bundle_path = self::TEST_DIR . "/". self::NEW_BUNDLE;
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            // documents 0-4 have no prefix; documents 5-9 start "aha be "
            $prefix = ($i > 4) ? "aha be " : "";
            $docs[] = [
                CC::DOC_ID => $this->docidFromIntKeys($i, $i, $i),
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION =>
                            "$prefix to$i be or$i not$i to$i be...",
                        CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "Some$i Shakespeare$i Play$i",
                        CC::URL => "https://www.somewhere$i.com/"
                    ],
                CC::PAGE => "Page $i",
            ];
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $be_info = IndexManager::getWordInfo($bundle_path,
            L\canonicalTerm("be"));
        $this->assertEqual(count($be_info['ROWS']), 3,
            "The word 'be' occurs in three partitions");
        $this->assertTrue(is_array($be_info['ROWS'][2]['POSTINGS']),
            "Active partition postings has been returned as array.");
        $index = IndexManager::getIndex($bundle_path);
        // row 0 only carries an offset and length, so its postings have to
        // be read back from partition 0 and unpacked before being counted
        $postings = $index->getPostingsString(0,
            $be_info['ROWS'][0]['POSTINGS_OFFSET'],
            $be_info['ROWS'][0]['POSTINGS_LEN']);
        $posting_rows = $index->postings_tools->unpack($postings);
        $this->assertEqual(count($posting_rows), 4,
            "Able to look up postings for Partition 0 of 'be' Word Info.");
    }
    /**
     * Computes a 24 byte docId by padding an int to the left with 0's
     *
     * @param int $i integer to make docId from
     * @return string docid made by padding
     */
    protected function docidFromInt($i)
    {
        return str_pad("$i", 24, "0", STR_PAD_LEFT);
    }
    /**
     * docids are typically made from three 8 byte strings. This function
     * takes three ints, left pads each with '0' (\x30), and concatenates
     * them to make a 24 byte docid. The first byte of the third 8 byte
     * group (the 17th byte of the docid) says whether the id is for a
     * document ('d') or a link ('l'); it is chosen using the $is_doc flag,
     * so the third int is only padded to the 7 bytes that follow it.
     * An int with more digits than its field is not truncated, in which
     * case the result is longer than 24 bytes.
     *
     * @param int $i_hash_url an int for the first 8 bytes (in non-artificial
     *      docids would be for the crawlHash of url document from)
     * @param int $j_hash_page an int for the second 8 bytes (in
     *      non-artificial docids would be for the crawlHash of document)
     * @param int $k_hash_host an int for the last 7 bytes (in non-artificial
     *      docids would be for the crawlHash of hostname of site document
     *      from)
     * @param bool $is_doc whether the hash is for a document or a link
     * @return string 24 byte docid.
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}
ViewGit