Last commit for src/library/archive_bundle_iterators/ArchiveBundleIterator.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\archive_bundle_iterators;

use seekquarry\yioop\library\CrawlConstants;

/**
 * Abstract class used to model iterating documents indexed in
 * a WebArchiveBundle or set of such bundles.
 *
 *
 * @author Chris Pollett
 * @see WebArchiveBundle
 */
abstract class ArchiveBundleIterator implements CrawlConstants
{
    /**
     * Timestamp of the archive that is being iterated over
     * @var int
     */
    public $iterate_timestamp;
    /**
     * Timestamp of the archive that is being used to store results in
     * @var int
     */
    public $result_timestamp;
    /**
     * Whether or not the iterator still has more documents
     * @var bool
     */
    public $end_of_iterator;
    /**
     * The path to the directory where the iteration status is stored.
     * @var string
     */
    public $result_dir;
    /**
     * Stores the current progress to the file iterate_status.txt in the result
     * dir such that a new instance of the iterator could be constructed and
     * return the next set of pages without having to process all of the pages
     * that came before. Each iterator should make a call to saveCheckpoint
     * after extracting a batch of pages.
     *
     * The position fields current_partition_num, current_page_num and
     * current_offset are not declared by this class; they are read from
     * the concrete iterator. A field the iterator has not set is saved as
     * null.
     *
     * @param array $info any extra info a subclass wants to save
     * @return bool true if the checkpoint file was written, false otherwise
     */
    public function saveCheckpoint($info = [])
    {
        $info['end_of_iterator'] = $this->end_of_iterator;
        // ?? null keeps the stored value the same as before for iterators
        // that do not track a given field, without an undefined property
        // warning
        $info['current_partition_num'] =
            $this->current_partition_num ?? null;
        $info['current_page_num'] = $this->current_page_num ?? null;
        $info['current_offset'] = $this->current_offset ?? null;
        $status_file = "{$this->result_dir}/iterate_status.txt";
        return file_put_contents($status_file, serialize($info)) !== false;
    }
    /**
     * Restores the internal state from the file iterate_status.txt in the
     * result dir such that the next call to nextPages will pick up from just
     * after the last checkpoint. Each iterator should make a call to
     * restoreCheckpoint at the end of the constructor method after the
     * instance members have been initialized.
     *
     * If the checkpoint file is missing, unreadable, or does not contain a
     * serialized array, the state of the iterator is left untouched and an
     * empty array is returned.
     *
     * @return array the data serialized when saveCheckpoint was called, or
     *     an empty array if no usable checkpoint could be read
     */
    public function restoreCheckpoint()
    {
        $status_file = "{$this->result_dir}/iterate_status.txt";
        if (!is_file($status_file) || !is_readable($status_file)) {
            return [];
        }
        $contents = file_get_contents($status_file);
        $info = ($contents === false) ? false : unserialize($contents);
        if (!is_array($info)) {
            // a truncated or corrupt checkpoint must not clobber the state
            return [];
        }
        $restored_fields = ['end_of_iterator', 'current_partition_num',
            'current_offset'];
        foreach ($restored_fields as $field) {
            if (array_key_exists($field, $info)) {
                $this->$field = $info[$field];
            }
        }
        // saveCheckpoint also records the page number, so restore it when a
        // value was actually stored (null means the iterator doesn't use it)
        if (isset($info['current_page_num'])) {
            $this->current_page_num = $info['current_page_num'];
        }
        return $info;
    }
    /**
     * Advances the iterator to the $limit page, with as little
     * additional processing as possible. The iterator is first reset to
     * the start of the archive bundle; if $limit is positive, $limit many
     * pages are then read with processing turned off.
     *
     * @param int $limit page to advance to
     */
    public function seekPage($limit)
    {
        $this->reset();
        if ($limit > 0) {
            $this->nextPages($limit, true);
        }
    }
    /**
     * Estimates the importance of the site according to the weighting of
     * the particular archive iterator
     * @param array $site an associative array containing info about a web
     *     page
     * @return mixed a 4-bit number or false if iterator doesn't use default
     *     ranking method
     */
    abstract public function weight(&$site);
    /**
     * Gets the next $num many docs from the iterator
     * @param int $num number of docs to get
     * @param bool $no_process do not do any processing on page data
     * @return array associative arrays for $num pages
     */
    abstract public function nextPages($num, $no_process = false);
    /**
     * Resets the iterator to the start of the archive bundle
     */
    abstract public function reset();

}
ViewGit