Last commit for src/library/index_bundle_iterators/NetworkIterator.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\AnalyticsManager;

/**
 * This iterator is used to handle querying a network of queue_servers
 * with regard to a query
 *
 * @author Chris Pollett
 */
class NetworkIterator extends IndexBundleIterator
{
    /**
     * Part of query without limit and num to be processed by all queue_server
     * machines
     *
     * @var string
     */
    public $base_query;
    /**
     * Current limit number (offset of the first requested result) to be
     * added to base query
     *
     * @var int
     */
    public $limit;
    /**
     * An array of urls of servers to ask a query to
     *
     * @var array
     */
    public $queue_servers;
    /**
     * Not read or written by any method of this class (they use
     * $more_flags); the declaration is kept so outside code referring to
     * this property keeps working.
     *
     * @var array
     */
    public $more_results;
    /**
     * Flags for each server saying if there are more results for that server
     * or not. Indexed by the position of the server in $queue_servers.
     *
     * @var array
     */
    public $more_flags = [];
    /**
     * Hashes of domains whose results should be removed from the blocks
     * returned by findDocsWithWord(); null when nothing is filtered
     *
     * @var array|null
     */
    public $filter;
    /**
     * used to adaptively change the number of pages requested from each
     * machine based on the number of machines that still have results
     * @var int
     */
    public $next_results_per_block;
    /**
     * Starts as false and is overwritten with the HARD_QUERY field of the
     * last successfully parsed queue server response when that field is set
     * @var mixed
     */
    public $hard_query;
    /**
     * the minimum number of pages to group from a block;
     */
    const MIN_FIND_RESULTS_PER_BLOCK = 200;
    /** Host Key position + 1 (first char says doc, inlink or external link)*/
    const HOST_KEY_POS = 17;
    /** Length of a doc key*/
    const KEY_LEN = 8;
    /**
     * Creates a network iterator with the given parameters.
     *
     * @param string $query the query that was supplied by the end user
     *      that we are trying to get search results for
     * @param array $queue_servers urls of yioop instances on which documents
     *      indexes live
     * @param string $timestamp the timestamp of the particular current index
     *      archive bundles that we look in for results
     * @param array& $filter an array of hashes of domains to filter from
     *      results
     * @param string $save_timestamp_name if this timestamp is nonzero, then
     *      when making queries to separate machines the save_timestamp is sent
     *      so the queries on those machine can make savepoints. Note the
     *      format of save_timestamp is timestamp-query_part where query_part
     *      is the number of the item in a query presentation (usually 0).
     */
    public function __construct($query, $queue_servers, $timestamp,
        &$filter = null, $save_timestamp_name = "")
    {
        // kept as an int so limit/num stay integers when put in the query
        $this->results_per_block = (int)self::MIN_FIND_RESULTS_PER_BLOCK;
        $this->next_results_per_block = $this->results_per_block;
        $this->hard_query = false;
        $this->base_query = "q=".urlencode($query).
            "&f=serial&network=false&raw=1&its=$timestamp&guess=false";
        if ($save_timestamp_name != "") {
            // used for archive crawls of crawl mixes
            $this->base_query .= "&save_timestamp=$save_timestamp_name";
        }
        $this->queue_servers = $queue_servers;
        $this->limit = 0;
        $count = count($this->queue_servers);
        // initially every server is assumed to have results
        $this->more_flags = ($count > 0) ? array_fill(0, $count, true) : [];
        if ($filter != null) {
            $this->filter = & $filter;
        } else {
            $this->filter = null;
        }
    }
    /**
     * Computes a relevancy score for a posting offset with respect to this
     * iterator and generation As this is not easily determined
     * for a network iterator, this method always returns 1.0 for this
     * iterator
     *
     * @param int $generation the generation the posting offset is for
     * @param int $posting_offset an offset into word_docs to compute the
     *     relevance of
     * @return float a relevancy score based on BM25F -- always 1.0.
     */
    public function computeRelevance($generation, $posting_offset)
    {
        return 1.0;
    }
    /**
     * Returns the iterators to the first document block that it could iterate
     * over: the limit goes back to 0 and every queue server is again
     * flagged as possibly having more results
     */
    public function reset()
    {
        $this->limit = 0;
        $this->next_results_per_block = $this->results_per_block;
        $this->hard_query = false;
        $count = count($this->queue_servers);
        $this->more_flags = ($count > 0) ? array_fill(0, $count, true) : [];
    }
    /**
     * Forwards the iterator one group of docs
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *     the must be of greater than or equal generation, and if equal the
     *     next block must all have $doc_offsets larger than or equal to
     *     this value. (This implementation does not use the argument.)
     */
    public function advance($gen_doc_offset = null)
    {
        $this->current_block_fresh = false;
        // skip past the block just consumed, then switch to the block size
        // computed by the last findDocsWithWord() call
        $this->limit += $this->results_per_block;
        $this->results_per_block = $this->next_results_per_block;
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be return by this iterator. As this is not easily determined
     * for a network iterator, this method always returns -1 for this
     * iterator
     *
     * @return mixed an array with the desired document offset
     * and generation; -1 on fail
     */
    public function currentGenDocOffsetWithWord()
    {
        return -1;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached. Sends the query to every queue server
     * still flagged as having results, merges the returned pages by key,
     * records timing info with AnalyticsManager, and applies the domain
     * filter.
     *
     * @return mixed array of page data keyed by doc key if any server
     *     returned pages (possibly empty after filtering), -1 otherwise
     */
    public function findDocsWithWord()
    {
        $query = $this->base_query .
            "&num={$this->results_per_block}&limit={$this->limit}";
        $sites = [];
        // maps index of a request in $sites to index in $queue_servers
        $lookup = [];
        $i = 0;
        $j = 0;
        foreach ($this->queue_servers as $server) {
            if (!empty($this->more_flags[$i])) {
                $sites[$j][CrawlConstants::URL] = $server ."?". $query.
                    "&machine=$i";
                $lookup[$j] = $i;
                $j++;
            }
            $i++;
        }
        $net_times = AnalyticsManager::get("NET_TIMES");
        $net_times = ($net_times) ? $net_times : 0;
        $download_time = microtime(true);
        $downloads = [];
        if (count($sites) > 0) {
            $downloads = FetchUrl::getPages($sites, false, 0, null, self::URL,
                self::PAGE, true);
        }
        $net_times += L\changeInMicrotime($download_time);
        AnalyticsManager::set("NET_TIMES", $net_times);
        $results = [];
        $count = count($downloads);
        $this->num_docs = 0;
        $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
        $line_break = "<br />";
        $machine_times = AnalyticsManager::get("MACHINE_TIMES");
        $machine_times = ($machine_times) ? $machine_times: "";
        $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
        $max_machine_times = ($max_machine_times) ? $max_machine_times : 0;
        $max_time = 0;
        $num_with_results = $count;
        $pre_result = null;
        for ($j = 0; $j < $count; $j++) {
            if (!isset($downloads[$j][self::PAGE]) ||
                !$downloads[$j][self::PAGE]) {
                continue;
            }
            $machine_id = $lookup[$j];
            /* NOTE(review): unserialize() is applied to data fetched over
               the network; this is only safe if every queue server is
               trusted.
             */
            set_error_handler(null);
            $pre_result = @unserialize($downloads[$j][self::PAGE]);
            set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
            if (!is_array($pre_result)) {
                // unparsable response: treat it as one with no fields
                $pre_result = [];
            }
            if (!isset($pre_result["TOTAL_ROWS"]) ||
                $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                // fewer rows than asked for means this server is exhausted
                $this->more_flags[$machine_id] = false;
                $num_with_results--;
            }
            if (isset($pre_result["TOTAL_ROWS"])) {
                $this->num_docs += $pre_result["TOTAL_ROWS"];
            }
            if (isset($pre_result["PAGES"])) {
                foreach ($pre_result["PAGES"] as $page_data) {
                    if (isset($page_data[self::KEY])) {
                        $page_data[self::MACHINE_ID] = $machine_id;
                        $results[$page_data[self::KEY]] = $page_data;
                    }
                }
            }
            $total_time = isset($pre_result['TOTAL_TIME']) ?
                floatval($pre_result['TOTAL_TIME']) : 0;
            $elapsed_time = isset($pre_result['ELAPSED_TIME']) ?
                floatval($pre_result['ELAPSED_TIME']) : 0;
            $max_time = max($max_time, $total_time);
            /* $sites is indexed by request number $j, whereas
               makeLookupLink uses one index both to find the site and to
               label the link, so hand it the site keyed by machine id.
             */
            $site = isset($sites[$j]) ? $sites[$j] : [];
            $lookup_link = $this->makeLookupLink([$machine_id => $site],
                $machine_id);
            $machine_times .= $in4 . $lookup_link . " ".
                number_format($elapsed_time, 6) . "/" .
                number_format($total_time, 6) . $line_break;
        }
        // drop the final line break, but only if one was actually appended
        if (substr($machine_times, -strlen($line_break)) === $line_break) {
            $machine_times = substr($machine_times, 0, -strlen($line_break));
        }
        if (isset($pre_result["HARD_QUERY"])) {
            $this->hard_query  = $pre_result["HARD_QUERY"];
        }
        if ($num_with_results > 0) {
            /* ask the servers that still have results for more each, so the
               total number of results requested per block stays the same
             */
            $this->next_results_per_block = (int)ceil(
                floatval($count * $this->results_per_block)/
                floatval($num_with_results));
        }
        $max_machine_times += $max_time;
        AnalyticsManager::set("MACHINE_TIMES", $machine_times);
        AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
        if (empty($results)) {
            $results = -1;
        } else if ($this->filter != null) {
            foreach ($results as $keys => $data) {
                $host_key =
                    substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                // strict so binary hashes are never compared as numbers
                if (in_array($host_key, $this->filter, true)) {
                    unset($results[$keys]);
                }
            }
        }
        // $results is -1 rather than an array when nothing was found
        $this->count_block = is_array($results) ? count($results) : 0;
        $this->pages = $results;
        return $results;
    }
    /**
     * Called to make a link for AnalyticsManager about a network query
     * performed by this iterator.
     *
     * @param array $sites used by this network iterator
     * @param int $index which site in array to make link for; also used
     *     as the number in the ID_ label of the link
     * @return string html of link
     */
    public function makeLookupLink($sites, $index)
    {
        if (isset($sites[$index][self::URL])) {
            $url = $sites[$index][self::URL];
            $title = $url;
        } else {
            $site = isset($sites[$index]) ? $sites[$index] : [];
            $tmp = urlencode(print_r($site, true));
            $title = 'URL not set';
            if (trim($tmp) == "") {
                $tmp = 'Site null';
            }
            $url = 'javascript:alert("'.$tmp.'")';
        }
        // both values land inside single-quoted html attributes
        $escape_flags = ENT_QUOTES | ENT_SUBSTITUTE;
        $url = htmlspecialchars($url, $escape_flags, 'UTF-8');
        $title = htmlspecialchars($title, $escape_flags, 'UTF-8');
        $link = "<a target='_blank' rel='noopener' class='gray-link'".
            " href='$url' title='$title' >ID_$index</a>:";
        return $link;
    }
    /**
     * Gets the summaries associated with the keys provided the keys
     * can be found in the current block of docs returned by this iterator.
     * This implementation ignores $keys and returns the whole current block.
     *
     * @param array $keys keys to try to find in the current block of returned
     *     results
     * @return mixed the current block of doc summaries; when the block had
     *     to be recomputed, whatever currentDocsWithWord() returned
     */
    public function getCurrentDocsForKeys($keys = null)
    {
        if ($this->current_block_fresh == false) {
            return $this->currentDocsWithWord();
        }
        return isset($this->pages) ? $this->pages : null;
    }
}
ViewGit