<?php
// Adjust copyright years
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Chris Pollett chris@pollett.org
* @license https://www.gnu.org/licenses/ GPL3
* @link https://www.seekquarry.com/
* @copyright 2009 - 2024
* @filesource
*/
namespace seekquarry\yioop\library\index_bundle_iterators;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\PhraseParser;
/**
* Used to iterate over the documents which occur in any of a set of
* WordIterator results
*
* @author Chris Pollett
* @see IndexArchiveBundle
*/
class UnionIterator extends IndexBundleIterator
{
    /**
     * An array of iterators whose union we get documents from. The
     * constructor stores them re-ordered (see __construct)
     * @var array
     */
    public $index_bundle_iterators;
    /**
     * Sum of the num_docs fields of all the underlying iterators. Used by
     * advance() when re-estimating num_docs
     * @var int
     */
    public $max_solution_docs;
    /**
     * Number of elements in $this->index_bundle_iterators
     * @var int
     */
    public $num_iterators;
    /**
     * Number of merge steps counted by findDocsWithWord() while building
     * the current block (reset to 0 at the start of each call)
     * @var int
     */
    public $count_block_unfiltered;
    /**
     * Running total of count_block_unfiltered, accumulated by advance()
     * and zeroed by reset()
     * @var int
     */
    public $seen_docs_unfiltered;
    /**
     * The timestamp of the index associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * Creates a union iterator with the given parameters.
     *
     * Each underlying iterator is switched to one result per block, and the
     * iterators are stored ordered (ascending) by, for each iterator, the
     * number of iterators listed after it that have strictly fewer docs.
     *
     * @param array $index_bundle_iterators 0-indexed list of iterators to
     *      use as a source of documents to iterate over
     * @param string $index_name time_stamp of the index to use
     */
    public function __construct($index_bundle_iterators,
        $index_name)
    {
        /*
            estimate number of results by sum of all iterator counts,
            then improve estimate as iterate
        */
        $num_iterators = count($index_bundle_iterators);
        $this->num_iterators = $num_iterators;
        $this->num_docs = 0;
        /*
            results_per_block is fixed here to MIN_RESULTS_TO_GROUP; it
            cannot be changed later (see setResultsPerBlock)
        */
        $this->results_per_block = intval(C\MIN_RESULTS_TO_GROUP);
        $this->seen_docs = 0;
        $this->seen_docs_unfiltered = 0;
        $this->count_block_unfiltered = 0;
        $this->index_name = $index_name;
        /*
            initialise so that an empty list of iterators leaves an empty
            array rather than an unset property
        */
        $this->index_bundle_iterators = [];
        $num_smaller = array_fill(0, $num_iterators, 0);
        $this->max_solution_docs = 0;
        for ($i = 0; $i < $num_iterators; $i++) {
            $index_bundle_iterators[$i]->setResultsPerBlock(1);
            $num_docs = $index_bundle_iterators[$i]->num_docs;
            $this->max_solution_docs += $num_docs;
            for ($j = 0; $j < $i; $j++) {
                if ($num_docs < $index_bundle_iterators[$j]->num_docs) {
                    $num_smaller[$j]++;
                }
            }
        }
        // asort keeps the original indices as keys so we can map back
        asort($num_smaller);
        $i = 0;
        foreach ($num_smaller as $index => $count) {
            $this->index_bundle_iterators[$i] =
                $index_bundle_iterators[$index];
            $i++;
        }
    }
    /**
     * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
     * depending on the direction in which this iterator traverse the
     * underlying index archive bundle. The direction is taken from the
     * first stored iterator; ASCENDING is returned if there is none.
     *
     * @return int direction traversing underlying archive bundle
     */
    public function getDirection()
    {
        if (!empty($this->index_bundle_iterators[0])) {
            return $this->index_bundle_iterators[0]->getDirection();
        }
        return self::ASCENDING;
    }
    /**
     * Resets every underlying iterator, zeroes the seen-document counters,
     * and then requests the current block of documents
     */
    public function reset()
    {
        for ($i = 0; $i < $this->num_iterators; $i++) {
            $this->index_bundle_iterators[$i]->reset();
        }
        $this->seen_docs = 0;
        $this->seen_docs_unfiltered = 0;
        /* return value is not needed here; presumably this primes the
           block cache kept by the base class -- TODO confirm */
        $this->currentDocsWithWord();
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached.
     *
     * Iterators are processed one at a time. The accumulators built from
     * the iterators processed so far are merged, by generation/doc offset
     * order, with the single-doc blocks of the next iterator, keeping at
     * most results_per_block accumulators. Relevance scores of entries
     * with equal offsets are summed. Finally accumulators are grouped by
     * their KEY field into the returned pages.
     *
     * @return mixed doc ids and score if there are docs left, -1 otherwise
     */
    public function findDocsWithWord()
    {
        $num_iterators = $this->num_iterators;
        $iterators = $this->index_bundle_iterators;
        $this->count_block_unfiltered = 0;
        $direction = $this->getDirection();
        $max_accumulators = intval($this->results_per_block);
        $to_accumulators = [];
        for ($i = 0; $i < $num_iterators; $i++) {
            $from_accumulators = $to_accumulators;
            $max_in_pos = count($from_accumulators);
            $to_accumulators = [];
            $quota_left = $max_accumulators - $max_in_pos;
            $iterator = $iterators[$i];
            if ($quota_left == 0) {
                /*
                    Accumulators are full: no new docs can be added, so
                    only look for docs of this iterator matching existing
                    accumulators and add in their relevance
                 */
                for ($j = 0; $j < $max_accumulators; $j++) {
                    $current_gen_doc_offset = $from_accumulators[$j][
                        self::GEN_OFFSET];
                    $iterator_offset = $iterator->currentGenDocOffsetWithWord();
                    if ($this->genDocOffsetCmp($iterator_offset,
                        $current_gen_doc_offset, $direction) < 0) {
                        $iterator->advance($current_gen_doc_offset);
                        $this->count_block_unfiltered++;
                    }
                    if (($iterator_offset =
                        $iterator->currentGenDocOffsetWithWord()) == -1) {
                        break;
                    }
                    $to_accumulators[$j] = $from_accumulators[$j];
                    $cmp = $this->genDocOffsetCmp($iterator_offset,
                        $current_gen_doc_offset, $direction);
                    if ($cmp == 0) {
                        $docs = $iterator->findDocsWithWord();
                        if (is_array($docs) && count($docs) == 1) {
                            $keys = array_keys($docs);
                            $doc = $docs[$keys[0]];
                            $to_accumulators[$j][self::RELEVANCE] +=
                                $doc[self::RELEVANCE];
                        }
                    }
                }
                // iterator ran out early: keep the untouched accumulators
                for ($k = $j; $k < $max_accumulators; $k++) {
                    $to_accumulators[$k] = $from_accumulators[$k];
                }
            } else {
                $in_pos = 0;
                $out_pos = 0;
                while ($out_pos < $max_accumulators) {
                    if (($iterator_offset =
                        $iterator->currentGenDocOffsetWithWord()) == -1) {
                        break;
                    }
                    if ($in_pos < $max_in_pos) {
                        $current_gen_doc_offset = $from_accumulators[$in_pos][
                            self::GEN_OFFSET];
                        $cmp = $this->genDocOffsetCmp($iterator_offset,
                            $current_gen_doc_offset, $direction);
                    } else {
                        // no accumulators left to merge, take iterator's doc
                        $cmp = -1;
                    }
                    if ($cmp < 0) {
                        $remaining_in_accumulator =
                            $max_in_pos - $in_pos - 1;
                        /*
                            only take a new doc from the iterator if enough
                            output slots remain for pending accumulators
                         */
                        if ($max_accumulators - $out_pos
                            > $remaining_in_accumulator) {
                            $docs = $iterator->findDocsWithWord();
                            if (is_array($docs) && count($docs) == 1) {
                                $keys = array_keys($docs);
                                $doc = $docs[$keys[0]];
                                $to_accumulators[$out_pos] = $doc;
                                $to_accumulators[$out_pos][self::GEN_OFFSET] =
                                    $iterator_offset;
                                $out_pos++;
                            }
                            $iterator->advance();
                        } else {
                            $to_accumulators[$out_pos++] =
                                $from_accumulators[$in_pos++];
                        }
                    } else if ($cmp == 0) {
                        /*
                            NOTE(review): the iterator is not advanced in
                            this case, so its current doc may be considered
                            again on the next pass -- confirm this is
                            intended
                         */
                        $to_accumulators[$out_pos] =
                            $from_accumulators[$in_pos];
                        $docs = $iterator->findDocsWithWord();
                        if (is_array($docs) && count($docs) == 1) {
                            $keys = array_keys($docs);
                            $doc = $docs[$keys[0]];
                            $to_accumulators[$out_pos][self::RELEVANCE] +=
                                $doc[self::RELEVANCE];
                        }
                        $out_pos++;
                        $in_pos++;
                    } else {
                        $to_accumulators[$out_pos++] =
                            $from_accumulators[$in_pos++];
                    }
                    $this->count_block_unfiltered++;
                }
                /*
                    If the iterator was exhausted before all accumulators
                    from earlier iterators were merged, carry the remaining
                    ones over (as the full-accumulator branch above does)
                    so documents already found are not lost from the union
                 */
                while ($in_pos < $max_in_pos &&
                    $out_pos < $max_accumulators) {
                    $to_accumulators[$out_pos++] =
                        $from_accumulators[$in_pos++];
                }
            }
        }
        // group accumulators sharing a KEY, combining their scores
        $pages = [];
        foreach ($to_accumulators as $accumulator) {
            if (!empty($accumulator[self::KEY])) {
                $accumulator[self::SCORE] = $accumulator[self::DOC_RANK] +
                    $accumulator[self::RELEVANCE];
                $page_acc = $pages[$accumulator[self::KEY]] ?? null;
                if (empty($page_acc)) {
                    $page_acc = $accumulator;
                } else if (empty($accumulator[self::IS_DOC]) &&
                    $page_acc[self::SCORE] > $accumulator[self::SCORE]) {
                    // keep existing entry's fields, add in new scores
                    $page_acc[self::DOC_RANK] += $accumulator[self::DOC_RANK];
                    $page_acc[self::RELEVANCE] += $accumulator[self::RELEVANCE];
                    $page_acc[self::SCORE] = $page_acc[self::DOC_RANK] +
                        $page_acc[self::RELEVANCE];
                } else {
                    // new entry's fields replace existing, scores are summed
                    $accumulator[self::DOC_RANK] += $page_acc[self::DOC_RANK];
                    $accumulator[self::RELEVANCE] += $page_acc[self::RELEVANCE];
                    $page_acc = $accumulator;
                    $page_acc[self::SCORE] = $page_acc[self::DOC_RANK] +
                        $page_acc[self::RELEVANCE];
                }
                $pages[$accumulator[self::KEY]] = $page_acc;
            }
        }
        if (empty($pages)) {
            return -1;
        }
        $this->pages = $pages;
        $this->count_block = count($pages);
        return $pages;
    }
    /**
     * Forwards the iterator one group of docs. Also updates the seen
     * document counters and re-estimates num_docs as
     * floor(seen_docs * max_solution_docs / seen_docs_unfiltered).
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *      the must be of greater than or equal generation, and if equal the
     *      next block must all have $doc_offsets larger than or equal to
     *      this value
     */
    public function advance($gen_doc_offset = null)
    {
        $this->current_block_fresh = false;
        $this->seen_docs += $this->count_block;
        $this->seen_docs_unfiltered += $this->count_block_unfiltered;
        /*
            Only re-estimate when something has been counted; dividing by
            zero would throw a DivisionByZeroError under PHP 8. Otherwise
            the previous estimate is kept.
         */
        if ($this->seen_docs_unfiltered > 0) {
            $this->num_docs =
                floor(($this->seen_docs * $this->max_solution_docs) /
                $this->seen_docs_unfiltered);
        }
        if ($gen_doc_offset != null) {
            foreach ($this->index_bundle_iterators as $iterator) {
                $iterator->advance($gen_doc_offset);
            }
        }
    }
    /**
     * This method is supposed to set
     * the value of the result_per_block field. This field controls
     * the maximum number of results that can be returned in one go by
     * currentDocsWithWord(). This method cannot be consistently
     * implemented for this iterator and expect it to behave nicely
     * if this iterator is used together with intersect_iterator. So
     * to prevent a user from doing this, calling this method results
     * in a user defined error
     *
     * @param int $num the maximum number of results that can be returned by
     *      a block
     */
    public function setResultsPerBlock($num)
    {
        trigger_error("Cannot set the results per block of\n" .
            "a union iterator", E_USER_ERROR);
    }
    /**
     * This method is supposed to get the doc_offset and generation
     * for the next document that would be return by
     * this iterator. It returns the value reported by the first stored
     * iterator whose current offset is not -1.
     *
     * @return mixed the desired document offset and generation, or -1 if
     *      every underlying iterator reports -1 (or there are none)
     */
    public function currentGenDocOffsetWithWord()
    {
        $gen_doc_offset = -1;
        foreach ($this->index_bundle_iterators as $iterator) {
            $gen_doc_offset = $iterator->currentGenDocOffsetWithWord();
            if ($gen_doc_offset != -1) {
                break;
            }
        }
        return $gen_doc_offset;
    }
}