First pass at modifying Yioop to again use Logarithmic Merge Trees for its dictionary structures.
The folder structure of IndexDocumentBundles has also been modified and now supports an overflow
folder (which could be on a different hard drive). ArcTool has been updated to support migration
to the new indexes.
<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009 - 2022 Chris Pollett chris@pollett.org
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* END LICENSE
*
* This file contains unit tests of the IndexManager class
*
* @author Chris Pollett chris@pollett.org
* @license https://www.gnu.org/licenses/ GPL3
* @link https://www.seekquarry.com/
* @copyright 2009 - 2022
* @filesource
*/
namespace seekquarry\yioop\tests;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\Model;
use seekquarry\yioop\library\CrawlConstants as CC;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UnitTest;
/**
* Used to run unit tests for the IndexManager class. IndexManager acts as
* a resource manager for the open indexes used to process a query.
*/
class IndexManagerTest extends UnitTest
{
    /**
     * Name of a pre version 3 index bundle. It is not currently used by any
     * of the test cases in this class.
     */
    const OLD_BUNDLE = "IndexData1421025145";
    /**
     * Name of the index bundle folder that setUp() creates inside of
     * TEST_DIR and that each test case looks up through IndexManager
     */
    const NEW_BUNDLE = "IndexData200000000";
    /**
     * Folder in which the bundles used by the index manager tests are made
     */
    const TEST_DIR = __DIR__ . '/test_files/index_manager_test';
    /**
     * Bundle created fresh by setUp() before each test case. Declared
     * explicitly so that assigning it does not create a dynamic property
     * (deprecated as of PHP 8.2).
     * @var IndexDocumentBundle
     */
    public $index_archive;
    /**
     * Before each test makes sure the test directory exists and creates
     * a new, writable IndexDocumentBundle named NEW_BUNDLE in it
     */
    public function setUp()
    {
        if (!file_exists(self::TEST_DIR)) {
            mkdir(self::TEST_DIR);
        }
        // NOTE(review): the trailing 3 and 5 are presumably small size
        // limits so a handful of documents spans several partitions --
        // confirm against the IndexDocumentBundle constructor
        $this->index_archive = new IndexDocumentBundle(self::TEST_DIR .
            "/" . self::NEW_BUNDLE, false, "TestBundle", 3, 5);
    }
    /**
     * Used after each test to delete the test directory
     */
    public function tearDown()
    {
        $model = new Model();
        $model->db->unlinkRecursive(self::TEST_DIR);
    }
    /**
     * Test case if IndexManager can determine the type and version of an index
     * to be able to return an instantiated version of that index.
     */
    public function getIndexTestCase()
    {
        $new_index = IndexManager::getIndex(self::TEST_DIR . "/".
            self::NEW_BUNDLE);
        $this->assertTrue($new_index, "Get existing new bundle not null");
    }
    /**
     * Test case used to test whether or not the index manager can determine
     * the version of a Yioop index.
     */
    public function getVersionTestCase()
    {
        $version_new = IndexManager::getVersion(self::TEST_DIR . "/".
            self::NEW_BUNDLE);
        $this->assertEqual($version_new, "3.2", "Version 3.2 index detected");
    }
    /**
     * Tests if IndexManager can return the dictionary information about a
     * word stored in an index it manages. Ten artificial documents, each
     * containing the word 'be', are added to the bundle made by setUp();
     * the word info for 'be' is then looked up and the postings of its
     * first row are read back from the index.
     */
    public function getWordInfoTestCase()
    {
        IndexManager::clearCache();
        $index_archive = $this->index_archive;
        $keys = [];
        $docs = [];
        /* $a is prepended to each description. It is empty for documents
           0 through 4 and "aha be " for documents 5 through 9, so the later
           documents contain an additional occurrence of 'be'.
         */
        $a = "";
        for ($i = 0; $i < 10; $i++) {
            $keys[$i] = $this->docidFromIntKeys($i, $i, $i);
            $docs[] = [
                CC::DOC_ID => $keys[$i],
                CC::SUMMARY =>
                    [
                        CC::DESCRIPTION => "$a to$i be or$i not$i to$i be...",
                        CC::HASH => str_pad("$i", 8, "0", STR_PAD_LEFT),
                        CC::TITLE => "Some$i Shakespeare$i Play$i",
                        CC::URL => "https://www.somewhere$i.com/"
                    ],
                CC::PAGE => "Page $i",
            ];
            if ($i > 3) {
                $a = "aha be ";
            }
        }
        $num_docs = count($docs);
        $index_archive->addPages($docs, $num_docs);
        $index_archive->updateDictionary();
        $index_archive->forceSave();
        $be_info = IndexManager::getWordInfo(self::TEST_DIR . "/".
            self::NEW_BUNDLE, L\canonicalTerm("be"));
        $this->assertEqual(count($be_info['ROWS']), 3,
            "The word 'be' occurs in three partitions");
        $this->assertTrue(is_array($be_info['ROWS'][2]['POSTINGS']),
            "Active partition postings has been returned as array.");
        $index = IndexManager::getIndex(self::TEST_DIR . "/".
            self::NEW_BUNDLE);
        // read the postings of the first row back using its offset and length
        $postings = $index->getPostingsString(0,
            $be_info['ROWS'][0]['POSTINGS_OFFSET'],
            $be_info['ROWS'][0]['POSTINGS_LEN']);
        $posting_rows = $index->postings_tools->unpack($postings);
        $this->assertEqual(count($posting_rows), 4,
            "Able to look up postings for Partition 0 of 'be' Word Info.");
    }
    /**
     * Computes a 24 byte docId by padding an int to the left with 0's
     *
     * @param int $i integer to make docId from
     * @return string docid made by padding
     */
    protected function docidFromInt($i)
    {
        return str_pad("$i", 24, "0", STR_PAD_LEFT);
    }
    /**
     * docids are typically made from three 8 byte strings. This function
     * takes three ints, left pads each with '0' (\x30), and concatenates
     * them to make a 24 byte docid. The first byte of the third 8 byte
     * component (the 17th byte of the docid) says whether the id is for a
     * document ('d') or a link ('l'), so the third int is only padded to
     * 7 bytes and is preceded by the character chosen using $is_doc.
     * str_pad never truncates, so the result is 24 bytes only if the first
     * two ints have at most 8 digits and the third has at most 7.
     *
     * @param int $i_hash_url an int for the first 8 bytes (in non-artificial
     *      docids would be for the crawlHash of url document from)
     * @param int $j_hash_page an int for the second 8 bytes (in
     *      non-artificial docids would be for the crawlHash of document)
     * @param int $k_hash_host an int for the last 7 bytes (in non-artificial
     *      docids would be for the crawlHash of hostname of site document
     *      from)
     * @param bool $is_doc whether the id is for a document or a link
     * @return string 24 byte docid.
     */
    protected function docidFromIntKeys($i_hash_url, $j_hash_page,
        $k_hash_host, $is_doc = true)
    {
        $doc_or_link = ($is_doc) ? "d" : "l";
        return str_pad("$i_hash_url", 8, "0", STR_PAD_LEFT) .
            str_pad("$j_hash_page", 8, "0", STR_PAD_LEFT) .
            $doc_or_link .
            str_pad("$k_hash_host", 7, "0", STR_PAD_LEFT);
    }
}