Last commit for tests/IndexShardTest.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\LinearAlgebra as LA;
use seekquarry\yioop\library\UnitTest;
use seekquarry\yioop\library\index_bundle_iterators\WordIterator;
use seekquarry\yioop\library\IndexManager;

/**
 * Used to test that the IndexShard class can properly add new documents
 * and retrieve those documents by word. Checks that doc offsets can be
 * updated, shards can be saved and reloaded
 *
 * @author Chris Pollett
 */
class IndexShardTest extends UnitTest
{
    /**
     * Creates the four index shards (shard, shard2, shard3, shard4) that the
     * test cases add documents to. Each one is constructed with the path of
     * a like-named .txt file in the work directory.
     */
    public function setUp()
    {
        foreach (['shard', 'shard2', 'shard3', 'shard4'] as $shard_name) {
            $this->test_objects[$shard_name] = new IndexShard(
                C\WORK_DIRECTORY . "/" . $shard_name . ".txt", 0);
        }
    }
    /**
     * Removes the files of any index shards the test cases may have written
     * to the work directory. The error handler is reset while the unlink
     * calls run and Yioop's handler is put back afterwards.
     */
    public function tearDown()
    {
        set_error_handler(null);
        foreach (['shard', 'shard2', 'shard3', 'shard4'] as $shard_name) {
            // a shard file only exists if the test case saved that shard
            @unlink(C\WORK_DIRECTORY . "/" . $shard_name . ".txt");
        }
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
    }
    /**
     * Check if can store documents into an index shard and retrieve them.
     * Three documents are added one at a time. After each addition postings
     * are looked up by word (and once by meta word) to check that exactly
     * the expected documents come back.
     */
    public function addDocumentsGetPostingsSliceByIdTestCase()
    {
        // the document key is built out of three 8 character components
        $docid = "AAAAAAAA";
        $doc_hash = "BBBBBBBB";
        $doc_hosts_url = "CCCCCCCC";
        $docid .= $doc_hash . $doc_hosts_url;
        $offset = 5;
        $word_counts = [
            'BBBBBBBB' => [1, 3],
            'CCCCCCCC' => [4, 9, 16],
            'DDDDDDDD' => [5, 25, 125],
        ];
        $meta_ids = ["EEEEEEEE", "FFFFFFFF"];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true);
        // 8 matches the number of word positions listed above (2 + 3 + 3)
        $this->assertEqual($this->test_objects['shard']->len_all_docs, 8,
            "Len All Docs Correctly Counts Length of First Doc");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('CCCCCCCC', true), 5);
        $this->assertTrue(isset($c_data[$docid]),
            "Doc lookup by word works");
        // add a second document and check
        $docid = "HHHHHHHH";
        $doc_hash = "IIIIIIII";
        $doc_hosts_url = "JJJJJJJJ";
        $docid .= $doc_hash. $doc_hosts_url;
        $offset = 7;
        $word_counts = [
            'CCCCCCCC' => [1, 4, 9],
            'GGGGGGGG' => [6],
        ];
        $meta_ids = ["YYYYYYYY"];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, [], true);
        // CCCCCCCC occurs in both of the documents added so far
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('CCCCCCCC', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Word lookup first item of two works");
        $this->assertTrue(isset($c_data["HHHHHHHHIIIIIIIIJJJJJJJJ"]),
            "Word lookup second item of two works");
        $this->assertEqual(count($c_data), 2,
            "Exactly two items were found in two item case");
        //add a meta word lookup, EEEEEEEE was only a meta id of the first doc
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('EEEEEEEE', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Doc lookup by meta word works");
        $this->assertEqual(count($c_data), 1,
            "Doc lookup by meta word works has correct count");
        // add a third document and check
        $docid = "KKKKKKKK";
        $doc_hash = "LLLLLLLL";
        $doc_hosts_url = "MMMMMMMM";
        $docid .= $doc_hash. $doc_hosts_url;
        $offset = 10;
        $word_counts = [
            'BBBBBBBB' => [1, 3],
            'DDDD' => [1, 4, 9],
            'GG' => [6],
        ];
        $meta_ids = ["YYYYYYYY"];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, [], true);
        /* BBBBBBBB is an ordinary word of the first and third documents
           (not a meta word), so the labels below say so
         */
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('BBBBBBBB', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Doc lookup by word finds first doc after third doc added");
        $this->assertTrue(isset($c_data["KKKKKKKKLLLLLLLLMMMMMMMM"]),
            "Doc lookup by word finds third doc");
        $this->assertEqual(count($c_data), 2,
            "Doc lookup by word after third doc added has correct count");
    }
    /**
     * Check if can iterate over posting slices in the reverse direction
     * To do this we construct two identical shards. We go over 'shard'
     * ascendingly, while we go over 'shard4' descendingly and compare
     */
    public function addDocumentsGetPostingsSliceReverseTestCase()
    {
        $docid = "AAAAAAAA";
        $doc_hash = "BBBBBBBB";
        $doc_hosts_url = "CCCCCCCC";
        $docid .= $doc_hash . $doc_hosts_url;
        $offset = 5;
        $word_counts = [
            'BBBBBBBB' => [1, 3],
            'CCCCCCCC' => [4, 9, 16],
            'DDDDDDDD' => [5, 25, 125],
        ];
        $meta_ids = ["EEEEEEEE", "FFFFFFFF"];
        // every document is added to both shards so they stay identical
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true);
        $this->test_objects['shard4']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true);
        $this->assertEqual($this->test_objects['shard']->len_all_docs, 8,
            "Len All Docs Correctly Counts Length of First Doc");
        // add a second document and check
        $docid = "HHHHHHHH";
        $doc_hash = "IIIIIIII";
        $doc_hosts_url = "JJJJJJJJ";
        $docid .= $doc_hash. $doc_hosts_url;
        $offset = 7;
        $word_counts = [
            'CCCCCCCC' => [1, 4, 9],
            'GGGGGGGG' => [6],
        ];
        $meta_ids = ["YYYYYYYY"];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true);
        $this->test_objects['shard4']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true);
        // add a third document
        $docid = "ABABABAB";
        $doc_hash = "IJIJIJIJ";
        $doc_hosts_url = "KLKLKLKL";
        $docid .= $doc_hash. $doc_hosts_url;
        $offset = 50;
        $word_counts = [
            'the' => [1,9,12,17,19,42,52,95,103],
            'mineral' => [2],
            'known' => [3],
            'as' => [4,132],
            'kryptonite' => [5,32,74,112,114,129],
            'was' => [6,33,55,86,121,130],
            'introduced' => [7,34,131],
            'in' => [8,16,24,58,91,102],
            'radio' => [10,53],
            'serial' => [11,54],
            'adventures' => [13],
            'of' => [14],
            'superman' => [15,69,107,137],
            'story' => [18,29],
            'meteor' => [20,104],
            'from' => [21,105],
            'krypton' => [22,106],
            'broadcast' => [23],
            'june' => [25],
            '19433' => [26],
            'an' => [27,59],
            'apocryphal' => [28],
            'claims' => [30],
            'that' => [31,120],
            'to' => [35,44,67,111,117,138],
            'give' => [36],
            'supermans' => [37],
            'voice' => [38,78],
            'actor' => [39,79],
            'bud' => [40],
            'collyer' => [41,62,116],
            'possibility' => [43],
            'take' => [45,118],
            'a' => [46,49,76,122,133],
            'vacation' => [47],
            'at' => [48],
            'time' => [50],
            'when' => [51],
            'performed' => [56],
            'live' => [57],
            'episode' => [60],
            'where' => [61],
            'would' => [63,70,80],
            'not' => [64],
            'be' => [65,71],
            'present' => [66],
            'perform' => [68],
            'incapacitated' => [72],
            'by' => [73,88],
            'and' => [75],
            'substitute' => [77],
            'make' => [81],
            'groaning' => [82],
            'sounds' => [83],
            'this' => [84,101],
            'tale' => [85],
            'recounted' => [87],
            'julius' => [89],
            'schwartz' => [90],
            'his' => [92,140],
            'memoir4' => [93],
            'however' => [94],
            'historian' => [96],
            'michael' => [97],
            'j' => [98],
            'hayde' => [99],
            'disputes' => [100],
            'is' => [108],
            'never' => [109],
            'exposed' => [110],
            'if' => [113],
            'allowed' => [115],
            'vacations' => [119],
            'fringe' => [123],
            'benefit' => [124],
            'discovered' => [125],
            'later' => [126],
            'more' => [127],
            'likely' => [128],
            'plot' => [134],
            'device' => [135],
            'for' => [136],
            'discover' => [139],
            'origin' => [141],
        ];
        $meta_ids = ["ZZZZZZZZ"];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true);
        $this->test_objects['shard4']->addDocumentWords($docid,
            $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true);
        // 'the' only occurs in the third document, so order cannot matter
        $forward = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('the', true), 5);
        $this->assertTrue(isset($forward[$docid]),
            "Doc lookup by word works for shard");
        $backward = $this->test_objects['shard4']->getPostingsSliceById(
            L\crawlHashWord('the', true), 5, IndexShard::DESCENDING);
        $this->assertTrue(isset($backward[$docid]),
            "Doc lookup by word works for shard4");
        $this->assertEqual($forward, $backward,
            "Both only have one document with this word");
        // CCCCCCCC occurs in the first two documents
        $info = $this->test_objects['shard']->getWordInfo(
            L\crawlHashWord('CCCCCCCC', true), true);
        list($first_offset, $last_offset, $num_docs_or_links) = $info;
        $this->assertEqual($first_offset, 36,
            "First posting offset for CCCCCCCC set correctly");
        $this->assertEqual($last_offset, 40,
            "Second posting offset for CCCCCCCC set correctly");
        $this->assertEqual($num_docs_or_links, 2,
            "Term CCCCCCCC appears in the correct number of documents");
        $forward = $this->test_objects['shard']->nextPostingOffsetDocOffset(
            $first_offset, $last_offset, 5);
        $this->assertEqual($forward[0], 36,
            "Search ascending finds correct next posting offset");
        $backward = $this->test_objects['shard4']->nextPostingOffsetDocOffset(
            $first_offset, $last_offset, 37, IndexShard::DESCENDING);
        /*
            Inspect the descending result itself (previously $forward was
            asserted a second time here, leaving $backward unchecked).
            TODO confirm the exact posting offset a descending search from
            37 should land on and pin it; until then only require that it
            is one of the postings of CCCCCCCC.
         */
        $this->assertTrue(is_array($backward) &&
            $backward[0] >= $first_offset && $backward[0] <= $last_offset,
            "Search descending finds correct next posting offset");
        /*
            Now we check posting slices between ascending descending are
            reversed
         */
        $forward = $this->test_objects['shard']->getPostingsSlice($first_offset,
            $first_offset, $last_offset, $num_docs_or_links);
        // have to reset offset values, since getPostingsSlice modifies by ref
        $info = $this->test_objects['shard4']->getWordInfo(
            L\crawlHashWord('CCCCCCCC', true), true);
        list($first_offset, $last_offset, $num_docs_or_links) = $info;
        $backward = $this->test_objects['shard4']->getPostingsSlice(
            $first_offset, $last_offset, $last_offset, $num_docs_or_links,
            IndexShard::DESCENDING);
        $this->assertEqual(array_keys($forward),
            array_reverse(array_keys($backward)),
            "DESCENDING Slice returns a reversed version off a ASCENDING one");
    }
    /**
     * Check if can store link documents into an index shard and retrieve
     * them. Three documents sharing the same first 16 key characters are
     * added and lookups by word must return only the documents that contain
     * the word.
     *
     * NOTE(review): the second and third additions pass true as the fifth
     * argument of addDocumentWords, whereas other test cases in this class
     * pass an array fifth and true sixth -- confirm which form the current
     * signature expects.
     */
    public function addLinkGetPostingsSliceByIdTestCase()
    {
        $shard = $this->test_objects['shard'];
        //set up link doc
        $link_key = "AAAAAAAABBBBBBBBCCCCCCCC";
        $summary_offset = 5;
        $positions = [
            'MMMMMMMM' => [1, 3, 5],
            'NNNNNNNN' => [2, 4, 6],
            'OOOOOOOO' => [7, 8, 9],
        ];
        $metas = ["PPPPPPPP", "QQQQQQQQ"];
        $shard->addDocumentWords($link_key, $summary_offset, $positions,
            $metas);
        // 9 matches the number of word positions listed above (3 + 3 + 3)
        $this->assertEqual($shard->len_all_link_docs, 9,
            "Len All Docs Correctly Counts Length of First Doc");
        $found = $shard->getPostingsSliceById(
            L\crawlHashWord('MMMMMMMM', true), 5);
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Link Doc lookup by word works");
        // second addition, shares the word MMMMMMMM with the first
        $second_key = "AAAAAAAA" . "BBBBBBBB" . "EEEEEEEE";
        $summary_offset = 10;
        $positions = [
            'BBBBBBBB' => [1],
            'CCCCCCCC' => [2],
            'MMMMMMMM' => [6],
        ];
        $metas = ["EEEEEEEE", "FFFFFFFF"];
        $shard->addDocumentWords($second_key, $summary_offset, $positions,
            $metas, true);
        $found = $shard->getPostingsSliceById(
            L\crawlHashWord('MMMMMMMM', true), 5);
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Link Doc lookup by word works 1st of two");
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBEEEEEEEE"]),
            "Link Doc lookup by word works 2nd doc");
        $this->assertEqual(count($found), 2,
            "Link Doc lookup by word works has correct count");
        // third addition, shares the word NNNNNNNN with the first only
        $third_key = "AAAAAAAA" . "BBBBBBBB" . "GGGGGGGG";
        $summary_offset = 21;
        $positions = [
            'DDDDDDDD' => [1, 4],
            'LLLLLLLL' => [7, 9],
            'NNNNNNNN' => [2],
        ];
        $metas = ["EEEEEEEE", "FFFFFFFF"];
        $shard->addDocumentWords($third_key, $summary_offset, $positions,
            $metas, true);
        $found = $shard->getPostingsSliceById(
            L\crawlHashWord('NNNNNNNN', true), 5);
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Link Doc lookup by word works 1st of two");
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBGGGGGGGG"]),
            "Link Doc lookup by word works 2nd doc");
        // the second addition does not contain NNNNNNNN
        $this->assertFalse(isset($found["AAAAAAAABBBBBBBBEEEEEEEE"]),
            "Link Doc lookup should not have third document");
        $this->assertEqual(count($found), 2,
            "Link Doc lookup by word works has correct count");
    }
    /**
     * Check that appending two index shards works correctly. One document
     * is added to 'shard' and two to 'shard2'; 'shard2' is appended to
     * 'shard' and every word and meta id of the three documents is looked
     * up in the result. Then a document is added to 'shard3', which is
     * appended as well, and looked up.
     */
    public function appendIndexShardTestCase()
    {
        $docid = "AAAAAAAA"; //it actually shouldn't matter if have one or more
            // 8 byte doc_keys, both should be treated as documents
        $offset = 5;
        $word_lists = [
            'BBBBBBBB' => [1],
            'CCCCCCCC' => [2],
            'DDDDDDDD' => [6],
        ];
        $meta_ids = ["EEEEEEEE", "FFFFFFFF"];
        //adding a document to the first shard
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids, true);
        $docid = "KKKKKKKKGGGGGGGGHHHHHHHH";
        $offset = 20;
        $word_lists = [
            'ZZZZZZZZ' => [9],
            'DDDDDDDD' => [4],
        ];
        $meta_ids = [];
        //adding a document to the second shard
        $this->test_objects['shard2']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $docid = "GGGGGGGG";
        $offset = 6;
        $word_lists = [
            'DDDDDDDD' => [3],
            'IIIIIIII' => [4],
            'JJJJJJJJ' => [5],
        ];
        $meta_ids = ["KKKKKKKK"];
        //adding another document to the second shard
        $this->test_objects['shard2']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $this->test_objects['shard']->appendIndexShard(
            $this->test_objects['shard2']);
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('BBBBBBBB', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAA"]),
            "Data from first shard present 1");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('CCCCCCCC', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAA"]),
            "Data from first shard present 2");
        // DDDDDDDD is a word of all three documents
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('DDDDDDDD', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAA"]),
            "Data from first shard present 3");
        $this->assertTrue(isset($c_data["KKKKKKKKGGGGGGGGHHHHHHHH"]),
            "Data from second shard present 1");
        $this->assertTrue(isset($c_data["GGGGGGGG"]),
            "Second doc of second shard present 1");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('EEEEEEEE', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAA"]),
            "Data from first shard present 4");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('FFFFFFFF', true), 5);
        $this->assertTrue(isset($c_data["AAAAAAAA"]),
            "Data from first shard present 5");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('ZZZZZZZZ', true), 5);
        $this->assertTrue(isset($c_data["KKKKKKKKGGGGGGGGHHHHHHHH"]),
            "Data from second shard present 2");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('IIIIIIII', true), 5);
        $this->assertTrue(isset($c_data["GGGGGGGG"]),
            "Second doc of second shard present 2");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('JJJJJJJJ', true), 5);
        $this->assertTrue(isset($c_data["GGGGGGGG"]),
            "Second doc of second shard present 3");
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('KKKKKKKK', true), 5);
        $this->assertTrue(isset($c_data["GGGGGGGG"]),
            "Second doc of second shard present 4");
        $docid = "HHHHHHHHDDDDDDDD";
        $offset = 50;
        $word_lists = [
            'YYYYYYYY' => [3, 7],
        ];
        $meta_ids = [];
        //adding a document to the third shard
        $this->test_objects['shard3']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $this->test_objects['shard']->appendIndexShard(
            $this->test_objects['shard3']);
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('YYYYYYYY', true), 5);
        $this->assertTrue(isset($c_data["HHHHHHHHDDDDDDDD"]),
            "Data from third shard present 1");
    }
    /**
     * Check that changing document offsets works. Five documents are added
     * with summary offset 0, new offsets are supplied for four of them and
     * the postings are checked. A sixth document is then added, two offsets
     * are changed again and the postings are rechecked.
     */
    public function changeDocumentOffsetTestCase()
    {
        $docid = "AAAAAAAASSSSSSSS";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $docid = "AAAAAAAAEEEEEEEEFFFFFFFF";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $docid = "CCCCCCCCFFFFFFFF";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1],
            'ZZZZZZZZ' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $docid = "QQQQQQQQEEEEEEEEFFFFFFFF";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        $docid = "DDDDDDDD";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        // CCCCCCCCFFFFFFFF is deliberately left out, it should stay at 0
        $new_doc_offsets = [
            "AAAAAAAASSSSSSSS" => 5,
            "AAAAAAAAEEEEEEEEFFFFFFFF" => 10,
            "QQQQQQQQEEEEEEEEFFFFFFFF" => 9,
            "DDDDDDDD" => 7,
        ];
        $this->test_objects['shard']->changeDocumentOffsets($new_doc_offsets);
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('BBBBBBBB', true), 5);
        $predicted_offsets = [
            "AAAAAAAASSSSSSSS" => 5,
            "CCCCCCCCFFFFFFFF" => 0,
            "AAAAAAAAEEEEEEEEFFFFFFFF" => 10,
            "QQQQQQQQEEEEEEEEFFFFFFFF" => 9,
            "DDDDDDDD" => 7,
        ];
        foreach ($predicted_offsets as $key =>$offset) {
            $this->assertTrue(isset($c_data[$key]),
                "Summary key matches predicted $key");
            $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET],
                $offset,  "Summary offset matches predicted offset $offset");
        }
        // the untouched document must read the same through its other word
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('ZZZZZZZZ', true), 5);
        $this->assertEqual($c_data['CCCCCCCCFFFFFFFF']
                [CrawlConstants::SUMMARY_OFFSET],
                0,  "Summary offset matches predicted second word");
        // adding new document after changing offset once
        $docid = "AAAAAAAASSSSSSSD";
        $offset = 0;
        $word_lists = [
            'BBBBBBBB' => [1]
        ];
        $meta_ids = [];
        $this->test_objects['shard']->addDocumentWords($docid,
            $offset, $word_lists, $meta_ids);
        // only two offsets change this time, the others keep earlier values
        $new_doc_offsets = [
            "QQQQQQQQEEEEEEEEFFFFFFFF" => 10,
            "DDDDDDDD" => 6,
        ];
        $this->test_objects['shard']->changeDocumentOffsets($new_doc_offsets);
        // six documents now contain BBBBBBBB, so ask for six postings
        $c_data = $this->test_objects['shard']->getPostingsSliceById(
            L\crawlHashWord('BBBBBBBB', true), 6);
        $predicted_offsets = [
            "AAAAAAAASSSSSSSS" => 5,
            "CCCCCCCCFFFFFFFF" => 0,
            "AAAAAAAAEEEEEEEEFFFFFFFF" => 10,
            "QQQQQQQQEEEEEEEEFFFFFFFF" => 10,
            "DDDDDDDD" => 6,
        ];
        foreach ($predicted_offsets as $key =>$offset) {
            $this->assertTrue(isset($c_data[$key]),
                "Summary key matches predicted $key");
            $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET],
                $offset,  "Summary offset matches predicted offset $offset");
        }
    }
    /**
     * Tests the IndexShard functions used to add auxiliary document keys to
     * the document key of a document. First packValues and unpackValues are
     * round tripped for int and float arrays, with and without a two byte
     * prefix that is skipped via the offset argument. Then
     * packAuxiliaryDocumentKeys and unpackAuxiliaryDocumentKeys are round
     * tripped for description scores and user ranks, including the cases
     * where either or both are empty.
     */
    public function documentKeyTestCase()
    {
        $index_shard = $this->test_objects['shard'];
        // int values, read from the start of the string
        $int_values = [99, 5, 12];
        $packed_ints = $index_shard->packValues($int_values);
        list($values, $end_offset) = $index_shard->unpackValues($packed_ints);
        $this->assertEqual($values, $int_values,
            "Pack unpack array int values no offset");
        $this->assertEqual($end_offset, 8,
            "Pack unpack array int values with 0 offset offset correct");
        // int values, read after two padding bytes
        list($values, $end_offset) = $index_shard->unpackValues(
            "\x00\x00". $packed_ints, 2);
        $this->assertEqual($values, $int_values,
            "Pack unpack array int values with offset");
        $this->assertEqual($end_offset, 10,
            "Pack unpack array int values with 2 offset offset correct");
        // float values are only expected to survive approximately
        $float_values = [.1, .3, .55, .66, .2];
        $packed_floats = $index_shard->packValues($float_values, "f");
        list($values, $end_offset) = $index_shard->unpackValues(
            $packed_floats, 0, "f");
        $this->assertTrue(LA::distance($values, $float_values) < 0.01,
            "Pack unpack array float values no offset");
        $this->assertEqual($end_offset, 12,
            "Pack unpack array float values with 0 offset offset correct");
        list($values, $end_offset) = $index_shard->unpackValues(
            "\x00\x00". $packed_floats, 2, 'f');
        $this->assertTrue(LA::distance($values, $float_values) < 0.01,
            "Pack unpack array float values with offset");
        $this->assertEqual($end_offset, 14,
            "Pack unpack array float values with 2 offset offset correct");
        /*
            Each row: description scores, user ranks, then the labels for
            the position check, the score check and the user rank check
         */
        $auxiliary_cases = [
            [[10 => .3, 21 => .1, 198 => .55], [.88, .99, .77],
                "Description score positions match after pack unpack",
                "Description scores close after pack unpack",
                "User ranks close after pack unpack"],
            [[], [.88, .99, .77],
                "Empty description score positions match after pack unpack",
                "Empty description scores close after pack unpack",
                "User ranks close after pack unpack empty description " .
                "score case"],
            [[10 => .3, 21 => .1, 198 => .55], [],
                "Description score positions match after pack unpack, " .
                "empty user rank",
                "Description scores close after pack unpack, empty user rank",
                "User ranks empty after pack unpack empty user rank case"],
            [[], [],
                "Description score positions match after pack unpack, " .
                "empty all case",
                "Description scores close after pack unpack, empty all case",
                "User ranks empty after pack unpack empty all case"],
        ];
        foreach ($auxiliary_cases as $auxiliary_case) {
            list($description_scores, $user_ranks, $positions_label,
                $scores_label, $ranks_label) = $auxiliary_case;
            $packed_keys = $index_shard->packAuxiliaryDocumentKeys(
                $description_scores, $user_ranks);
            list($out_scores, $out_ranks) =
                $index_shard->unpackAuxiliaryDocumentKeys($packed_keys);
            $this->assertEqual(array_keys($out_scores),
                array_keys($description_scores), $positions_label);
            $this->assertTrue(
                LA::distance($out_scores, $description_scores) < 0.01,
                $scores_label);
            $this->assertTrue(LA::distance($out_ranks, $user_ranks) < 0.01,
                $ranks_label);
        }
    }
    /**
     * Check that save and load work. A document is added to 'shard', which
     * is then saved to its file and loaded back as 'shard2', saved to a
     * string and loaded back from that string, and finally saved without
     * its dictionary and reopened. After each step the document has to be
     * retrievable.
     */
    public function saveLoadTestCase()
    {
        $shard_path = C\WORK_DIRECTORY . "/shard.txt";
        $original = $this->test_objects['shard'];
        $doc_key = "AAAAAAAABBBBBBBBCCCCCCCC";
        $summary_offset = 5;
        $positions = [
            'BBBBBBBB' => [1],
            'CCCCCCCC' => [2],
            'DDDDDDDD' => [6],
        ];
        $metas = ["EEEEEEEE", "FFFFFFFF"];
        //test saving and loading to a file
        $original->addDocumentWords($doc_key, $summary_offset, $positions,
            $metas, true);
        $original->save();
        $this->test_objects['shard2'] = IndexShard::load($shard_path);
        // the return value is not needed at this point, the call is kept
        $original->getWordInfo(L\crawlHashWord('FFFFFFFF'));
        $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3,
            "Len All Docs Correctly Counts Length of First Doc");
        // every word and meta id of the document, with its assertion label
        $file_checks = [
            'BBBBBBBB' => "Doc lookup by word works",
            'CCCCCCCC' => "Doc lookup 2 by word works",
            'DDDDDDDD' => "Doc lookup 2 by word works",
            'EEEEEEEE' => "Doc lookup 2 by word works",
            'FFFFFFFF' => "Doc lookup 2 by word works",
        ];
        foreach ($file_checks as $word => $label) {
            $found = $this->test_objects['shard2']->getPostingsSliceById(
                L\crawlHashWord($word, true), 5);
            $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
                $label);
        }
        // test saving and loading from a string
        $shard_string = $original->save(true);
        $this->test_objects['shard2'] = IndexShard::load("shard.txt",
            $shard_string);
        $string_checks = [
            'BBBBBBBB' => "String Load Doc lookup by word works",
            'CCCCCCCC' => "String Load Doc lookup 2 by word works",
            'DDDDDDDD' => "String Load Doc lookup 2 by word works",
            'EEEEEEEE' => "String Load Doc lookup 2 by word works",
            'FFFFFFFF' => "String Load Doc lookup 2 by word works",
        ];
        foreach ($string_checks as $word => $label) {
            $found = $this->test_objects['shard2']->getPostingsSliceById(
                L\crawlHashWord($word, true), 5);
            $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
                $label);
        }
        // Check if save without dictionary preserves postings
        $word_info = $original->getWordInfo(L\crawlHashWord('FFFFFFFF'));
        $original->saveWithoutDictionary();
        $reopened = new IndexShard($shard_path, 0, C\NUM_DOCS_PER_PARTITION,
            true);
        // offsets come from the word info fetched before the dictionary
        // was dropped
        $found = $reopened->getPostingsSlice($word_info[0],
            $word_info[0], $word_info[1], 5);
        $this->assertTrue(isset($found["AAAAAAAABBBBBBBBCCCCCCCC"]),
            "Save without dictionary test works");
    }
}
ViewGit