Last commit for tests/IndexDocumentBundleTest.php: afd6930f42e31d81a53d42061b5fd758f56c62de

First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures

Chris Pollett [2024-01-15 02:Jan:th]

First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures
Folder structure of IndexDocumentBundles also modified and now supports overflow folder (which
could be on a different hard drive). ArcTool has been updated to support migration to new
indexes

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the IndexDocumentBundle class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\UnitTest;

/**
 * Used to test that the IndexDocumentBundle class can properly add and
 * retrieve documents. Checks that its prepareIndexMap method correctly
 * deduplicates documents before inverted index creation. Tests inverted
 * index creation and adding terms to the IndexDocumentBundle's dictionary.
 * Checks look up of documents according to term.
 */
 class IndexDocumentBundleTest extends UnitTest
{
    /**
     * Folder in which the IndexDocumentBundle used for testing is created
     */
    const TEST_DIR = __DIR__ . '/test_files/index_document_test';
    /**
     * Type byte that docidFromInt() writes into the doc ids it makes when
     * no other type is given
     */
    const TEST_DOC_TYPE = "\x10"; //link type;
    /**
     * Holds the IndexDocumentBundle used for test purposes
     * @var IndexDocumentBundle
     */
    public $index_archive;
    /**
     * Creates a fresh IndexDocumentBundle in self::TEST_DIR before a test
     * case is run. For addPartitionPostingsDictionaryTestCase the bundle
     * is made with only 3 documents per partition (presumably so that the
     * ten documents that test adds span more than one partition); every
     * other test case uses the configured C\NUM_DOCS_PER_PARTITION.
     */
    public function setUp()
    {
        $num_docs_per_partition = ($this->current_method ===
            "addPartitionPostingsDictionaryTestCase") ? 3 :
            C\NUM_DOCS_PER_PARTITION;
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR,
            false, "TestBundle", $num_docs_per_partition, 5);
    }
    /**
     * Deletes the folder self::TEST_DIR, and everything in it, that was
     * made for the test bundle
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Checks if the constructor of the IndexDocumentBundle correctly saves
     * the constructor info such as the bundle description
     */
    public function saveDescriptionTestCase()
    {
        $archive_info = IndexDocumentBundle::getArchiveInfo(self::TEST_DIR);
        $this->assertEqual($archive_info["DESCRIPTION"], "TestBundle");
    }
    /**
     * Tests that after adding pages to an IndexDocumentBundle, each page,
     * and its summary can be retrieved.
     */
    public function addGetPagesTestCase()
    {
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $docs[] = [CC::DOC_ID => $this->docidFromInt($i),
                CC::SUMMARY => [CC::TITLE => "title $i",
                CC::DESCRIPTION => "desc $i"],
                CC::PAGE => "page $i"];
        }
        $this->index_archive->addPages($docs, 10);
        for ($i = 0; $i < 10; $i++) {
            $doc_id = $this->docidFromInt($i);
            $summary = $this->index_archive->getSummary($doc_id, 0);
            $this->assertEqual($summary, $docs[$i][CC::SUMMARY],
                "Summary Test $i");
            $page = $this->index_archive->getCachePage($doc_id, 0);
            $this->assertEqual($page, $docs[$i][CC::PAGE],
                "Page Test $i");
        }
    }
    /**
     * Tests the prepareIndexMap method which is used to deduplicate pages
     * before an inverted index of a partition is made. Tests adding
     * pages with the same hash_url to make sure they get grouped together.
     * Grouping also affects how documents are scored so this is tested
     * as well.
     */
    public function prepareIndexTestCase()
    {
        $docs = [];
        for ($i = 260; $i > 250; $i--) {
            $docs[] = [CC::DOC_ID => $this->docidFromInt($i),
                CC::SUMMARY => [CC::TITLE => "title $i",
                CC::DESCRIPTION => "desc $i"],
                CC::PAGE => "page $i"];
        }
        $this->index_archive->addPages($docs, 10);
        $index = $this->index_archive->documents->loadPartitionIndex(0, true);
        // keys of the partition index should come back in insertion order
        $i = 0;
        foreach ($index as $test_doc_id => $row) {
            $this->assertEqual($test_doc_id, $docs[$i][CC::DOC_ID],
                "Key Partition Decode Test ". $i);
            $i++;
        }
        $grouped_urls = $this->index_archive->prepareIndexMap(0);
        /* hash_url component of doc_key is first eight bytes
           in above should all be same, so will be grouped into one group
         */
        $this->assertEqual(count($grouped_urls), 1,
            "Grouping by hash url works count test");
        /* none of the doc_ids above carry the 'd' document marker, so
           they will all be considered aux_keys (not main doc)
         */
        $this->assertEqual(substr_count($grouped_urls['00000000'][CC::AUX_DOCS],
            "\xFF"), 10, "Grouping by hash url works count aux_docs test");
        /* the score of a doc_id is initially its position in key_partition.
            groups have scores equal to the sum of the grouped doc_key's
            scores. In this case for 10 urls have 1+2+ ... + 10 = 55
         */
        $this->assertEqual($grouped_urls['00000000'][CC::SCORE], 55,
            "Grouping by hash url works count score test");
        $test_index = [];
        for ($i = 1; $i <= 100; $i++) {
            /* $j cycles through 1, ..., 10 as $i runs through 1, ..., 100,
               so every tenth $i shares a hash for both a doc and a link.
               Given our scoring system the first 10 of these should have
               the highest scores and be chosen as the "official" docs for
               the grouping.
             */
            $j = (($i - 1) % 10) + 1;
            $test_index[$this->docidFromIntKeys($i, $j, $j, false)] = "lnk";
            $test_index[$this->docidFromIntKeys($i, $j, $j, true)] = "doc";
        }
        $grouped_urls = $this->index_archive->prepareIndexMap(0,
            $test_index);
        foreach ($grouped_urls as $hash_url => $group) {
            $hash_int = intval($hash_url);
            /* messages name the group being checked ($hash_url) so that a
               failure can be traced back to a particular group
             */
            if ($hash_int <= 10) {
                $this->assertTrue(substr_count(
                    $group[CC::AUX_DOCS], "\xFF") > 0,
                    "Nonempty group case multiple same hash_page case " .
                    $hash_url);
            } else {
                $this->assertEqual($group[CC::AUX_DOCS], "metas_only",
                    "Metas only group case multiple same hash_page case " .
                    $hash_url);
            }
        }
    }
    /**
     * Tests the process of adding documents to the IndexDocumentBundle, then
     * building an inverted index from this. To check this, after the above
     * is done, performs lookups of terms known to have posting lists
     * and then checks the properties of the returned posting lists.
     */
    public function buildInvertedIndexPartitionTestCase()
    {
        $keys = [$this->docidFromIntKeys(1, 1, 1),
            $this->docidFromIntKeys(2, 2, 2)];
        $docs = [
            [
                CC::DOC_ID => $keys[0],
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION => "To be or not to be...",
                        CC::HASH => str_pad("1", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "Some Shakespeare Play",
                        CC::URL => "https://www.somewhere1.com/"
                    ],
                CC::PAGE => "Page 1",
            ],
            [
                CC::DOC_ID => $keys[1],
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION => "Take me out to the ball game...",
                        CC::HASH => str_pad("2", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "A Dialog on Baseball for people now",
                        CC::URL => "https://www.somewhere2.com/"
                    ],
                CC::PAGE => "Page 2",
            ]
        ];
        $this->index_archive->addPages($docs, 2);
        $this->index_archive->buildInvertedIndexPartition();
        $base_folder = $this->index_archive->getPartitionBaseFolder(0);
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $this->index_archive->doc_map_tools;
        $doc_map = $doc_map_tools->load($doc_map_filename);
        foreach ($keys as $i => $key) {
            $row = $doc_map_tools->find($doc_map, $key);
            //get row after bloom filter of terms
            $row = substr($row, IndexDocumentBundle::TERMSFILTER_LEN + 1);
            $entry = $doc_map_tools->unpack($row);
            $summary = $docs[$i][CC::SUMMARY];
            $preface_length = str_word_count(UrlParser::getWordsInHostUrl(
                $summary[CC::URL]) . " " . $summary[CC::TITLE]) + 3;
            // low order byte of the POS field is compared to preface length
            $entry_preface_length = ($entry[1]["POS"] & 255);
            $this->assertEqual($preface_length, $entry_preface_length,
                "Doc $i preface length matches calculated.");
        }
        $posting_tools = $this->index_archive->postings_tools;
        $postings_filename = $base_folder . "/" .
            IndexDocumentBundle::POSTINGS_FILENAME;
        $postings = $posting_tools->load($postings_filename);
        $stemmed_ball = L\PhraseParser::stemTerms("baseball", 'en-US')[0];
        $baseball_id = L\canonicalTerm($stemmed_ball, true);
        $row = $posting_tools->find($postings, $baseball_id);
        $entry = $posting_tools->unpack($row);
        $this->assertEqual($entry[0]['FREQUENCY'], 1,
            "Test retrieve posting frequency");
        $positions_filename = $base_folder . "/" .
            IndexDocumentBundle::POSITIONS_FILENAME;
        // read only the bytes of the position list of the first posting
        $encoded_positions = file_get_contents($positions_filename,
            false, null, $entry[0]['POSITIONS_OFFSET'],
            $entry[0]['POSITIONS_LEN']);
        $position_list = L\decodePositionList($encoded_positions,
            $entry[0]['FREQUENCY']);
        $host_word_count = str_word_count(UrlParser::getWordsInHostUrl(
            $docs[1][CC::SUMMARY][CC::URL]));
        $this->assertEqual($position_list[0], 1 + $host_word_count + 1 + 3,
            "Test Position List Decode");
    }
    /**
     * Tests the complete process of adding documents, dedup,
     * building an inverted index and adding the result to the
     * IndexDocumentBundle's dictionary. To do this, after the above is
     * done, performs lookups of terms known to be in the indexed documents
     * and checks the properties of the returned posting lists.
     */
    public function addPartitionPostingsDictionaryTestCase()
    {
        $index_archive = $this->index_archive;
        $dictionary = $index_archive->dictionary;
        $keys = [];
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $keys[$i] = $this->docidFromIntKeys($i, $i, $i);
            $docs[] = [
                CC::DOC_ID => $keys[$i],
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION => "to$i be or$i not$i to$i be...",
                        CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "Some$i Shakespeare$i Play$i",
                        CC::URL => "https://www.somewhere$i.com/"
                    ],
                CC::PAGE => "Page $i",
            ];
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $be_term_id = L\canonicalTerm("be");
        $term_row = $dictionary->get($be_term_id);
        $num_completed_partitions = 2;
        $this->assertEqual(count($term_row), $num_completed_partitions,
            "Two completed partitions");
        /* postings of the partition still being written to are looked up
           in that partition's own postings file
         */
        $active_folder = $index_archive->getPartitionBaseFolder(
            $index_archive->documents->parameters["SAVE_PARTITION"]);
        $active_postings_filename = $active_folder . "/postings";
        $posting_tools = $index_archive->postings_tools;
        $this->assertTrue(file_exists($active_postings_filename),
            "Active postings file exists");
        $active_postings = [];
        if (file_exists($active_postings_filename)) {
            $active_dictionary = $posting_tools->load(
                $active_postings_filename);
            $active_term_row = $posting_tools->find($active_dictionary,
                $be_term_id);
            $active_postings = $posting_tools->unpack($active_term_row);
        }
        $sum = 0;
        for ($i = 0; $i < $num_completed_partitions; $i++) {
            $sum += ($term_row[$i]['NUM_DOCS'] ?? 0);
        }
        $this->assertEqual($sum + count($active_postings), $num_docs,
            "Term 'be' occurs in correct number of documents");
        for ($i = 0; $i < $num_completed_partitions; $i++) {
            $partition_folder = $index_archive->getPartitionBaseFolder(
                $term_row[$i]['PARTITION']);
            $postings_filename = $partition_folder . "/" .
                IndexDocumentBundle::POSTINGS_FILENAME;
            $postings_string = file_get_contents($postings_filename,
                false, null, $term_row[$i]['POSTINGS_OFFSET'],
                $term_row[$i]['POSTINGS_LEN']);
            $postings = $posting_tools->unpack($postings_string);
            $positions_filename = $partition_folder . "/" .
                IndexDocumentBundle::POSITIONS_FILENAME;
            /* POSITIONS_OFFSET values are accumulated in $last before being
               used as a file offset
               NOTE(review): the number of postings checked (4) is hard-coded
               -- confirm it matches the postings per completed partition
             */
            $last = 0;
            for ($j = 0; $j < 4; $j++) {
                $encoded_positions = file_get_contents($positions_filename,
                    false, null, $last + $postings[$j]['POSITIONS_OFFSET'],
                    $postings[$j]['POSITIONS_LEN']);
                $last += $postings[$j]['POSITIONS_OFFSET'];
                $position_list = L\decodePositionList($encoded_positions,
                    $postings[$j]['FREQUENCY']);
                $this->assertEqual($position_list[0], 9,
                    "Test Position List Decode");
            }
        }
    }
    /**
     * Computes a 24 byte docId by padding an int to the left with 0's and
     * then overwriting the byte at offset
     * 2 * IndexDocumentBundle::DOCID_PART_LEN with a type byte
     *
     * @param int $i integer to make docId from
     * @param string $type single byte written into the docid to give its
     *      type, defaults to self::TEST_DOC_TYPE
     * @return string docid made by padding
     */
    protected function docidFromInt($i, $type = self::TEST_DOC_TYPE)
    {
        $pre_key = str_pad("$i", 24, "0", STR_PAD_LEFT);
        $pre_key[IndexDocumentBundle::DOCID_PART_LEN << 1] = $type;
        return $pre_key;
    }
    /**
     * docids are typically made from three 8 byte strings. This function
     * takes three ints, left pads the first two with '0' (\x30) to 8 bytes
     * each, left pads the third to 7 bytes, and concatenates them to make a
     * 24 byte docid. The one byte between the second and third parts
     * (the 17th byte of the docid) says whether the id is for a
     * document ('d') or a link ('l'); the value of the $is_doc flag
     * determines which of these is written.
     *
     * @param int $i_hash_url an int for the first 8 bytes (in non-artificial
     *      docids would be for the crawlHash of url document from)
     * @param int $j_hash_page an int for the second 8 bytes (in
     *      non-artificial docids would be for the crawlHash of document)
     * @param int $k_hash_host an int for the last 7 bytes (in non-artificial
     *      docids would be for the crawlHash of hostname of site document
     *      from)
     * @param bool $is_doc whether the docid is for a document or a link
     * @return string 24 byte docid.
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link . str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}

ViewGit