Last commit for src/library/FeedArchiveBundle.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog, crawlHash, and garbageCollect
 */
require_once __DIR__ . '/Utility.php';
/**
 * Subclass of IndexArchiveBundle with bloom filters to make it easy to check
 * if a news feed item has been added to the bundle already before adding it
 *
 * @author Chris Pollett
 */
class FeedArchiveBundle extends IndexArchiveBundle
{
    /**
     * Used to store unique identifiers of feed items that have been stored
     * in this FeedArchiveBundle. filter_a is the filter consulted when
     * checking if an item is already in the archive. Once it holds more than
     * URL_FILTER_SIZE/2 items, new items are added to filter_b as well as
     * filter_a. Once filter_a holds more than URL_FILTER_SIZE items it is
     * deleted, filter_b is renamed to filter_a, and the process repeats.
     * @var BloomFilterFile
     */
    public $filter_a;
    /**
     * Auxiliary BloomFilterFile used in checking if feed items are in this
     * archive or not. It is null until filter_a is more than half full.
     * @see $filter_a
     * @var BloomFilterFile
     */
    public $filter_b;

    /**
     * Makes or initializes a FeedArchiveBundle with the provided parameters.
     * Loads filter_a.ftr from the bundle folder if it exists, otherwise
     * creates a fresh filter; loads filter_b.ftr only if it already exists.
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *      IndexArchiveBundle
     * @param int $num_docs_per_generation the number of pages to be stored
     *      in a single shard
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null, $num_docs_per_generation =
        C\NUM_DOCS_PER_PARTITION)
    {
        parent::__construct($dir_name, $read_only_archive, $description,
            $num_docs_per_generation);
        $filter_a_path = $dir_name . "/filter_a.ftr";
        $filter_b_path = $dir_name . "/filter_b.ftr";
        if (file_exists($filter_a_path)) {
            $this->filter_a = BloomFilterFile::load($filter_a_path);
        } else {
            $this->filter_a = new BloomFilterFile($filter_a_path,
                C\URL_FILTER_SIZE);
            /* Best-effort permission change: the error handler is removed
               and warnings are suppressed so a failing chmod is silent,
               then the project's handler is reinstalled.
             */
            set_error_handler(null);
            @chmod($filter_a_path, 0755);
            set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        }
        /* The auxiliary filter must be stored in filter_b; assigning it to
           filter_a would throw away the primary filter loaded above.
         */
        if (file_exists($filter_b_path)) {
            $this->filter_b = BloomFilterFile::load($filter_b_path);
        } else {
            $this->filter_b = null;
        }
    }
    /**
     * Records the key of every page in $pages in the bloom filter pair of
     * this bundle and then stores the pages using the parent's addPages
     * method with the given $generation, $offset_field and
     * $visited_urls_count.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param string $key_field field of each page item which holds the
     *      unique identifier for that item
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *     (visited urls is a smaller number than the total count of objects
     *     stored in the index).
     */
    public function addPagesAndSeenKeys($generation, $offset_field, $key_field,
        &$pages, $visited_urls_count)
    {
        foreach ($pages as $page) {
            $key = $page[$key_field];
            $this->addFilters($key);
        }
        parent::addPages($generation, $offset_field, $pages,
            $visited_urls_count);
    }
    /**
     * Adds the key (often GUID) of a feed item to the bloom filter pair
     * associated with this archive. The key is always added to filter a.
     * If filter a is more than half full, the key is added to filter b as
     * well (creating or loading filter b when needed). If filter a is full,
     * it is deleted, filter b takes its place as filter a, and the process
     * continues with a new filter b being created once the new filter a
     * becomes half full.
     *
     * @param string $key unique identifier of a feed item
     */
    public function addFilters($key)
    {
        $filter_a_path = $this->dir_name . "/filter_a.ftr";
        $filter_b_path = $this->dir_name . "/filter_b.ftr";
        if ($this->filter_a->count > C\URL_FILTER_SIZE/2 &&
            !$this->filter_b) {
            if (file_exists($filter_b_path)) {
                $this->filter_b = BloomFilterFile::load($filter_b_path);
            } else {
                $this->filter_b = new BloomFilterFile($filter_b_path,
                    C\URL_FILTER_SIZE);
                /* Only adjust permissions if the file is already on disk;
                   forceSave() sets them again after each save.
                 */
                if (file_exists($filter_b_path)) {
                    chmod($filter_b_path, 0755);
                }
            }
        }
        if ($this->filter_a->count > C\URL_FILTER_SIZE && $this->filter_b) {
            /* Write the in-memory filter b out first so the file that is
               about to become filter_a.ftr reflects everything added so far.
             */
            $this->filter_b->save();
            if (file_exists($filter_a_path)) {
                unlink($filter_a_path);
            }
            if (file_exists($filter_b_path)) {
                rename($filter_b_path, $filter_a_path);
            }
            /* Swap the in-memory objects too; otherwise the stale, full
               filter a would keep being used and this branch would be
               re-entered on every call.
             */
            $this->filter_a = $this->filter_b;
            /* NOTE(review): assumes BloomFilterFile keeps its save path in a
               public `filename` property -- confirm against
               BloomFilterFile/PersistentStructure. Without this, later
               saves of the promoted filter would go to filter_b.ftr.
             */
            $this->filter_a->filename = $filter_a_path;
            $this->filter_b = null;
        }
        $this->filter_a->add($key);
        if ($this->filter_b) {
            $this->filter_b->add($key);
        }
    }
    /**
     * Whether the active filter (filter a) for this feed contains the feed
     * item with the supplied key
     *
     * @param string $key the feed item id to check if in archive
     * @return bool true if it is in the archive, false otherwise
     */
    public function contains($key)
    {
        return $this->filter_a->contains($key);
    }
    /**
     * Forces the current shard to be saved, and saves filter a and, if
     * present, filter b, setting the permissions of each filter file after
     * it is written
     */
    public function forceSave()
    {
        $this->getActiveShard()->save(false, true);
        $this->filter_a->save();
        chmod($this->dir_name . "/filter_a.ftr", 0755);
        if ($this->filter_b) {
            $this->filter_b->save();
            chmod($this->dir_name . "/filter_b.ftr", 0755);
        }
    }
}
ViewGit