Last commit for src/library/BloomFilterBundle.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

/**
 * Used for garbageCollect
 */
require_once __DIR__ . '/Utility.php';
/**
 *
 * A BloomFilterBundle is a directory of BloomFilterFiles.
 * The filter bundle, like a Bloom filter, also acts as a set,
 * but once the active filter in it fills up a new filter is
 * added to the bundle so that more data can be stored.
 *
 * @author Chris Pollett
 * @see BloomFilterFile
 */
class BloomFilterBundle
{
    /**
     * Reference to the filter which will be used to store new data.
     * It is always the highest numbered filter file of the bundle
     * (filter_{num_filters - 1}.ftr)
     * @var object
     */
    public $current_filter;
    /**
     * Total number of filters that this filter bundle currently has
     * @var int
     */
    public $num_filters;
    /**
     * The number of items which have been stored in the current filter
     * @var int
     */
    public $current_filter_count;
    /**
     * The maximum capacity of a filter in this filter bundle
     * @var int
     */
    public $filter_size;
    /**
     * The folder name of this filter bundle
     * @var string
     */
    public $dir_name;
    /**
     * The default maximum size of a filter in a filter bundle
     */
    const default_filter_size = 10000000;
    /**
     * Creates or loads if already exists the directory structure and
     * BloomFilterFiles used by this bundle
     *
     * @param string $dir_name directory where this bundle's data is stored.
     *     It is created (together with any missing parent folders) if it
     *     does not exist yet
     * @param int $filter_size the size of an individual filter in this
     *     bundle; once a filter is filled a new one is added to the
     *     directory. Only used when the bundle is first created, an
     *     existing bundle keeps the size recorded in its meta.txt
     */
    public function __construct($dir_name,
        $filter_size = self::default_filter_size )
    {
        $this->dir_name = $dir_name;
        if (!is_dir($dir_name)) {
            // recursive so that a nested bundle path can be created in one go
            mkdir($dir_name, 0777, true);
        }
        $this->loadMetaData();
        if ($this->num_filters == 0) {
            // brand new bundle: create and persist its first filter
            $this->current_filter =
                new BloomFilterFile($dir_name . "/filter_0.ftr", $filter_size);
            $this->num_filters++;
            $this->filter_size = $filter_size;
            $this->current_filter->save();
            $this->saveMetaData();
        } else {
            // existing bundle: new data goes into the last filter on disk
            $last_filter = $this->num_filters - 1;
            $this->current_filter =
                BloomFilterFile::load($dir_name . "/filter_$last_filter.ftr");
        }
    }
    /**
     * Inserts a $value into the BloomFilterBundle
     *
     * This involves inserting into the current filter, if the filter
     * is full, a new filter is added before the value is added
     *
     * @param string $value a item to add to the filter bundle
     */
    public function add($value)
    {
        if ($this->current_filter_count >= $this->filter_size) {
            // flush the full filter and drop it before allocating a new one
            $this->current_filter->save();
            $this->current_filter = null;
            garbageCollect();
            $last_filter = $this->num_filters;
            $this->current_filter =
                new BloomFilterFile($this->dir_name .
                    "/filter_$last_filter.ftr", $this->filter_size);
            $this->current_filter_count = 0;
            $this->num_filters++;
            /*
                As in the constructor and reset(), write the new filter
                before the meta data so that meta.txt never counts a
                filter file which has not been written to disk yet
             */
            $this->current_filter->save();
            $this->saveMetaData();
        }
        $this->current_filter->add($value);
        $this->current_filter_count++;
    }
    /**
     * Removes from the passed array those elements $elt who either are in
     * the filter bundle or whose $elt[$field_name] is in the bundle.
     *
     * Keys of the elements which survive are preserved (the array is not
     * reindexed) and elements are only read, never modified. Elements, or
     * fields of elements, whose value is exactly false are never looked up
     * and so are always kept.
     *
     * @param array &$arr the array to remove elements from
     * @param array $field_names if not null an array of field names of $arr
     *     to use to do filtering. If null assumes elts of $arr are strings
     *      and directly checks those strings.
     */
    public function differenceFilter(&$arr, $field_names = null)
    {
        if (empty($arr)) {
            return; // nothing to filter, so don't load any filter from disk
        }
        $incremental_time = microtime(true);
        $num_filters = $this->num_filters;
        $count = count($arr);
        for ($i = 0; $i < $num_filters; $i++) {
            if ($i == $num_filters - 1) {
                // the last filter is already in memory
                $tmp_filter = $this->current_filter;
            } else {
                $tmp_filter = BloomFilterFile::load($this->dir_name .
                    "/filter_$i.ftr");
            }
            if (empty($tmp_filter)) {
                continue;
            }
            /*
                Walk the keys which are still present rather than indices
                0..$count-1, and read values by value rather than by
                reference. Taking a reference to a missing array element
                creates it with a null value, which would bring back
                elements removed by an earlier filter and would break on
                arrays whose keys are not contiguous.
             */
            foreach (array_keys($arr) as $key) {
                if ($field_names === null) {
                    $value = $arr[$key];
                    if ($value !== false && $tmp_filter->contains($value)) {
                    /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                        unset($arr[$key]);
                    }
                } else { //now do the same strategy for the array of fields case
                    foreach ($field_names as $field_name) {
                        // a missing field is looked up as null
                        $value = isset($arr[$key][$field_name]) ?
                            $arr[$key][$field_name] : null;
                        if ($value !== false &&
                            $tmp_filter->contains($value)) {
                            unset($arr[$key]);
                            break;
                        }
                    }
                }
                if (changeInMicrotime($incremental_time) > 30) {
                    // progress message at most once every 30 seconds
                    crawlLog("..Processing item $key of $count from filter ".
                        "number $i of $num_filters.");
                    $incremental_time = microtime(true);
                }
            }
        }
    }
    /**
     * Loads from the filter bundles' meta.txt the meta data associated with
     * this filter bundle and stores this data into field variables.
     *
     * If meta.txt is missing, unreadable, or does not hold a serialized
     * array, the defaults for an empty bundle are used; the same is done
     * individually for any entry missing from the array.
     */
    public function loadMetaData()
    {
        $this->num_filters = 0;
        $this->current_filter_count = 0;
        $this->filter_size = self::default_filter_size;
        $meta_file = $this->dir_name . '/meta.txt';
        if (!file_exists($meta_file)) {
            return;
        }
        $contents = file_get_contents($meta_file);
        if ($contents === false) {
            return;
        }
        // meta.txt is expected to be the array written by saveMetaData()
        $meta = unserialize($contents);
        if (!is_array($meta)) {
            return;
        }
        if (isset($meta['NUM_FILTERS'])) {
            $this->num_filters = $meta['NUM_FILTERS'];
        }
        if (isset($meta['CURRENT_FILTER_COUNT'])) {
            $this->current_filter_count = $meta['CURRENT_FILTER_COUNT'];
        }
        if (isset($meta['FILTER_SIZE'])) {
            $this->filter_size = $meta['FILTER_SIZE'];
        }
    }
    /**
     * Saves the meta data (number of filter, number of items stored, and size)
     * of the bundle to the file meta.txt in the bundle's folder
     */
    public function saveMetaData()
    {
        $meta = [];
        $meta['NUM_FILTERS'] = $this->num_filters;
        $meta['CURRENT_FILTER_COUNT'] = $this->current_filter_count;
        $meta['FILTER_SIZE'] = $this->filter_size;
        file_put_contents($this->dir_name . '/meta.txt', serialize($meta));
    }
    /**
     * Empties the contents of the bloom filter bundle and resets
     * it to start storing new data.
     *
     * All filter files counted by the bundle are deleted and a single
     * fresh filter_0.ftr of the bundle's filter size is written.
     */
    public function reset()
    {
        for ($i = 0; $i < $this->num_filters; $i++) {
            // best effort: a filter file may already be gone
            @unlink($this->dir_name . "/filter_$i.ftr");
        }
        $this->num_filters = 0;
        $this->current_filter_count = 0;
        $this->current_filter =
            new BloomFilterFile($this->dir_name . "/filter_0.ftr",
            $this->filter_size);
        $this->num_filters++;
        $this->current_filter->save();
        $this->saveMetaData();
    }
    /**
     * Used to save to disk all the file data associated with this bundle:
     * its meta data and the filter currently being written to
     */
    public function forceSave()
    {
        $this->saveMetaData();
        $this->current_filter->save();
    }
}
ViewGit