<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * This file contains unit tests of the IndexDocumentBundle class
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\UnitTest;

/**
 * Used to test that the IndexDocumentBundle class can properly add and
 * retrieve documents. Checks that its prepareIndexMap method correctly
 * deduplicates documents before inverted index creation. Tests inverted
 * index creation and adding terms to the IndexDocumentBundle's dictionary.
 * Checks look up of documents according to term.
 */
class IndexDocumentBundleTest extends UnitTest
{
    /**
     * Folder in which the IndexDocumentBundle used by each test is created
     */
    const TEST_DIR = __DIR__ . '/test_files/index_document_test';
    /**
     * Type byte written into doc ids made by docidFromInt when no other
     * type is given. The value used is the link type.
     */
    const TEST_DOC_TYPE = "\x10";
    /**
     * Holds the IndexDocumentBundle used for test purposes
     * @var IndexDocumentBundle
     */
    public $index_archive;
    /**
     * Creates a fresh IndexDocumentBundle in TEST_DIR for the test case
     * about to run. For addPartitionPostingsDictionaryTestCase the bundle is
     * created with 3 documents per partition rather than
     * C\NUM_DOCS_PER_PARTITION, so that a handful of documents is enough to
     * span several partitions.
     */
    public function setUp()
    {
        $num_docs_per_partition = C\NUM_DOCS_PER_PARTITION;
        if ($this->current_method ==
            "addPartitionPostingsDictionaryTestCase") {
            $num_docs_per_partition = 3;
        }
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR,
            false, "TestBundle", $num_docs_per_partition, 5);
    }
    /**
     * Recursively deletes TEST_DIR, and so the bundle made by setUp
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Checks if the constructor of the IndexDocumentBundle correctly saves
     * the constructor info such as the bundle description
     */
    public function saveDescriptionTestCase()
    {
        $archive_info = IndexDocumentBundle::getArchiveInfo(self::TEST_DIR);
        $this->assertEqual($archive_info["DESCRIPTION"], "TestBundle");
    }
    /**
     * Tests that after adding pages to an IndexDocumentBundle, each page,
     * and its summary can be retrieved.
     */
    public function addGetPagesTestCase()
    {
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $docs[] = [
                CC::DOC_ID => $this->docidFromInt($i),
                CC::SUMMARY => [
                    CC::TITLE => "title $i",
                    CC::DESCRIPTION => "desc $i"
                ],
                CC::PAGE => "page $i"
            ];
        }
        $this->index_archive->addPages($docs, 10);
        for ($i = 0; $i < 10; $i++) {
            $doc_id = $this->docidFromInt($i);
            $summary = $this->index_archive->getSummary($doc_id, 0);
            $this->assertEqual($summary, $docs[$i][CC::SUMMARY],
                "Summary Test $i");
            $page = $this->index_archive->getCachePage($doc_id, 0);
            $this->assertEqual($page, $docs[$i][CC::PAGE],
                "Page Test $i");
        }
    }
    /**
     * Tests the prepareIndexMap method which is used to deduplicate pages
     * before an inverted index of a partition is made. Tests adding pages
     * with the same hash_url doc_id component to make sure they get grouped
     * together. Grouping also affects how documents are scored so this is
     * tested as well.
     */
    public function prepareIndexTestCase()
    {
        $docs = [];
        for ($i = 260; $i > 250; $i--) {
            $docs[] = [
                CC::DOC_ID => $this->docidFromInt($i),
                CC::SUMMARY => [
                    CC::TITLE => "title $i",
                    CC::DESCRIPTION => "desc $i"
                ],
                CC::PAGE => "page $i"
            ];
        }
        $this->index_archive->addPages($docs, 10);
        $index = $this->index_archive->documents->loadPartitionIndex(0, true);
        $i = 0;
        foreach ($index as $test_doc_id => $row) {
            $this->assertEqual($test_doc_id, $docs[$i][CC::DOC_ID],
                "Key Partition Decode Test " . $i);
            $i++;
        }
        $grouped_urls = $this->index_archive->prepareIndexMap(0);
        /* the hash_url component of a doc_key is its first eight bytes. In
           the above these are all the same, so the doc_keys should be
           grouped into one group
         */
        $this->assertEqual(count($grouped_urls), 1,
            "Grouping by hash url works count test");
        /* none of the doc_ids above are marked with a d, so all will be
           considered aux_keys (not main doc)
         */
        $this->assertEqual(substr_count(
            $grouped_urls['00000000'][CC::AUX_DOCS], "\xFF"), 10,
            "Grouping by hash url works count aux_docs test");
        /* the score of a doc_id is initially its position in the
           key_partition. Groups have scores equal to the sum of the grouped
           doc_key's scores. In this case, for 10 urls we have
           1 + 2 + ... + 10 = 55
         */
        $this->assertEqual($grouped_urls['00000000'][CC::SCORE], 55,
            "Grouping by hash url works count score test");
        $test_index = [];
        $i = 1;
        while ($i <= 100) {
            /* every tenth $i should share a hash for both a doc and a link.
               Given our scoring system, the first 10 of these should have
               the highest scores and be chosen as the "official" docs for
               the grouping.
             */
            for ($j = 1; $j <= 10; $j++) {
                $test_index[$this->docidFromIntKeys($i, $j, $j, false)] =
                    "lnk";
                $test_index[$this->docidFromIntKeys($i, $j, $j, true)] =
                    "doc";
                $i++;
            }
        }
        $grouped_urls = $this->index_archive->prepareIndexMap(0, $test_index);
        foreach ($grouped_urls as $hash_url => $group) {
            $hash_int = intval($hash_url);
            /* messages report the hash of the group being checked; $i at
               this point is just the exhausted loop counter from above
             */
            if ($hash_int <= 10) {
                $this->assertTrue(substr_count(
                    $group[CC::AUX_DOCS], "\xFF") > 0,
                    "Nonempty group case multiple same hash_page case " .
                    $hash_int);
            } else {
                $this->assertEqual($group[CC::AUX_DOCS], "metas_only",
                    "Metas only group case multiple same hash_page case " .
                    $hash_int);
            }
        }
    }
    /**
     * Tests the process of adding documents to the IndexDocumentBundle, then
     * building an inverted index from this. To check this, after the above
     * is done, performs lookups of terms known to have posting lists
     * and then checks the properties of the returned posting lists.
     */
    public function buildInvertedIndexPartitionTestCase()
    {
        $keys = [
            $this->docidFromIntKeys(1, 1, 1),
            $this->docidFromIntKeys(2, 2, 2)
        ];
        $docs = [
            [
                CC::DOC_ID => $keys[0],
                CC::SUMMARY => [
                    CC::DESCRIPTION => "To be or not to be...",
                    CC::HASH => str_pad("1", 8, "0", STR_PAD_LEFT),
                    CC::TITLE => "Some Shakespeare Play",
                    CC::URL => "https://www.somewhere1.com/"
                ],
                CC::PAGE => "Page 1",
            ],
            [
                CC::DOC_ID => $keys[1],
                CC::SUMMARY => [
                    CC::DESCRIPTION => "Take me out to the ball game...",
                    CC::HASH => str_pad("2", 8, "0", STR_PAD_LEFT),
                    CC::TITLE => "A Dialog on Baseball for people now",
                    CC::URL => "https://www.somewhere2.com/"
                ],
                CC::PAGE => "Page 2",
            ]
        ];
        $this->index_archive->addPages($docs, 2);
        $this->index_archive->buildInvertedIndexPartition();
        $base_folder = $this->index_archive->getPartitionBaseFolder(0);
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $this->index_archive->doc_map_tools;
        $doc_map = $doc_map_tools->load($doc_map_filename);
        foreach ($keys as $i => $key) {
            $row = $doc_map_tools->find($doc_map, $key);
            /* get the part of the row after the bloom filter of terms */
            $row = substr($row, IndexDocumentBundle::TERMSFILTER_LEN + 1);
            $entry = $doc_map_tools->unpack($row);
            $summary = $docs[$i][CC::SUMMARY];
            /* NOTE(review): expected preface is the words of the host plus
               the words of the title plus 3; presumably the 3 covers
               separators added when indexing -- confirm against
               IndexDocumentBundle
             */
            $preface_length = str_word_count(UrlParser::getWordsInHostUrl(
                $summary[CC::URL]) . " " . $summary[CC::TITLE]) + 3;
            $entry_preface_length = ($entry[1]["POS"] & 255);
            $this->assertEqual($preface_length, $entry_preface_length,
                "Doc $i preface length matches calculated.");
        }
        $posting_tools = $this->index_archive->postings_tools;
        $postings_filename = $base_folder . "/" .
            IndexDocumentBundle::POSTINGS_FILENAME;
        $postings = $posting_tools->load($postings_filename);
        $stemmed_ball = L\PhraseParser::stemTerms("baseball", 'en-US')[0];
        $baseball_id = L\canonicalTerm($stemmed_ball, true);
        $row = $posting_tools->find($postings, $baseball_id);
        $entry = $posting_tools->unpack($row);
        $this->assertEqual($entry[0]['FREQUENCY'], 1,
            "Test retrieve posting frequency");
        $positions_filename = $base_folder . "/" .
            IndexDocumentBundle::POSITIONS_FILENAME;
        $encoded_positions = file_get_contents($positions_filename, false,
            null, $entry[0]['POSITIONS_OFFSET'],
            $entry[0]['POSITIONS_LEN']);
        $position_list = L\decodePositionList($encoded_positions,
            $entry[0]['FREQUENCY']);
        $host_word_count = str_word_count(UrlParser::getWordsInHostUrl(
            $docs[1][CC::SUMMARY][CC::URL]));
        $this->assertEqual($position_list[0], 1 + $host_word_count + 1 + 3,
            "Test Position List Decode");
    }
    /**
     * Tests the complete process of going from documents, to dedup, to
     * building an inverted index, and adding the result to the
     * IndexDocumentBundle's dictionary. To do this, after the above is
     * done, performs lookups of terms known to be in the indexed documents
     * and checks the properties of the returned posting lists.
     */
    public function addPartitionPostingsDictionaryTestCase()
    {
        $index_archive = $this->index_archive;
        $dictionary = $index_archive->dictionary;
        $keys = [];
        $docs = [];
        for ($i = 0; $i < 10; $i++) {
            $keys[$i] = $this->docidFromIntKeys($i, $i, $i);
            $docs[] = [
                CC::DOC_ID => $keys[$i],
                CC::SUMMARY => [
                    CC::DESCRIPTION => "to$i be or$i not$i to$i be...",
                    CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                    CC::TITLE => "Some$i Shakespeare$i Play$i",
                    CC::URL => "https://www.somewhere$i.com/"
                ],
                CC::PAGE => "Page $i",
            ];
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $be_term_id = L\canonicalTerm("be");
        $term_row = $dictionary->get($be_term_id);
        $this->assertEqual(count($term_row), 2,
            "Two completed partitions");
        $active_folder = $index_archive->getPartitionBaseFolder(
            $index_archive->documents->parameters["SAVE_PARTITION"]);
        $active_postings_filename = $active_folder . "/postings";
        $posting_tools = $index_archive->postings_tools;
        $this->assertTrue(file_exists($active_postings_filename),
            "Active postings file exists");
        $active_postings = [];
        if (file_exists($active_postings_filename)) {
            $active_dictionary = $posting_tools->load(
                $active_postings_filename);
            $active_term_row = $posting_tools->find($active_dictionary,
                $be_term_id);
            $active_postings = $posting_tools->unpack($active_term_row);
        }
        /* documents containing 'be' are those recorded in the dictionary for
           the two completed partitions, plus those in the active partition
         */
        $sum = 0;
        for ($i = 0; $i < 2; $i++) {
            $sum += ($term_row[$i]['NUM_DOCS'] ?? 0);
        }
        $this->assertEqual($sum + count($active_postings), $num_docs,
            "Term 'be' occurs in correct number of documents");
        for ($i = 0; $i < 2; $i++) {
            $partition_folder = $index_archive->getPartitionBaseFolder(
                $term_row[$i]['PARTITION']);
            $postings_filename = $partition_folder . "/" .
                IndexDocumentBundle::POSTINGS_FILENAME;
            $postings_string = file_get_contents($postings_filename, false,
                null, $term_row[$i]['POSTINGS_OFFSET'],
                $term_row[$i]['POSTINGS_LEN']);
            $postings = $posting_tools->unpack($postings_string);
            $positions_filename = $partition_folder . "/" .
                IndexDocumentBundle::POSITIONS_FILENAME;
            /* each POSITIONS_OFFSET is added to the running total of the
               earlier ones to get the place to read from in the file
             */
            $last = 0;
            for ($j = 0; $j < 4; $j++) {
                $encoded_positions = file_get_contents($positions_filename,
                    false, null, $last + $postings[$j]['POSITIONS_OFFSET'],
                    $postings[$j]['POSITIONS_LEN']);
                $last += $postings[$j]['POSITIONS_OFFSET'];
                $position_list = L\decodePositionList($encoded_positions,
                    $postings[$j]['FREQUENCY']);
                $this->assertEqual($position_list[0], 9,
                    "Test Position List Decode");
            }
        }
    }
    /**
     * Computes a 24 byte docId by padding an int to the left with 0's and
     * then overwriting the byte at index
     * IndexDocumentBundle::DOCID_PART_LEN << 1 with a type byte
     *
     * @param int $i integer to make docId from
     * @param string $type single byte used as the type of the docid
     * @return string docid made by padding
     */
    protected function docidFromInt($i, $type = self::TEST_DOC_TYPE)
    {
        $pre_key = str_pad("$i", 24, "0", STR_PAD_LEFT);
        $pre_key[IndexDocumentBundle::DOCID_PART_LEN << 1] = $type;
        return $pre_key;
    }
    /**
     * docids are typically made from three 8 byte strings. This function
     * takes three ints, left pads each with '0' (\x30), and concatenates
     * them to make a 24 byte docid. The first two ints are padded to 8 bytes
     * each. The third is padded to only 7 bytes and is preceded by a single
     * byte saying whether the id is for a document ('d') or a link ('l');
     * the value of the $is_doc flag determines which.
     *
     * @param int $i_hash_url an int for the first 8 bytes (in non-artificial
     *      docids would be for the crawlHash of url document from)
     * @param int $j_hash_page an int for the second 8 bytes (in
     *      non-artificial docids would be for the crawlHash of document)
     * @param int $k_hash_host an int for the last 7 bytes, which follow the
     *      doc or link byte (in non-artificial docids would be for the
     *      crawlHash of hostname of site document from)
     * @param bool $is_doc whether the hash is for a document or a link
     * @return string 24 byte docid.
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}