viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library\archive_bundle_iterators;

use seekquarry\yioop\library\CrawlConstants;

/**
 * Abstract class used to model iterating documents indexed in
 * a WebArchiveBundle or set of such bundles.
 *
 * Besides declaring the iteration interface (weight, nextPages, reset),
 * this class implements checkpointing: the iterator's position is
 * serialized to iterate_status.txt in the result directory so a fresh
 * instance can resume where a previous one stopped.
 *
 * @author Chris Pollett
 * @see WebArchiveBundle
 */
abstract class ArchiveBundleIterator implements CrawlConstants
{
    /**
     * Used to iterate over contents in a bzipped file
     * @var object
     */
    public $bz2_iterator;
    /**
     * Used to store the name of the compression that should be used when
     * iterating. For example, gzip, bzip, etc.
     * @var string
     */
    public $compression;
    /**
     * If the archive uses a string of some kind to separate records, then
     * delimiter is a regular expression which will match the separator
     * @var string
     */
    public $delimiter;
    /**
     * Default character encoding used by records in the archive. For example,
     * UTF-8
     * @var string
     */
    public $encoding;
    /**
     * Used to store fields of meta information which is needed in making
     * header information for each record processed.
     * (such as base_address, ip_address, lang, etc.)
     * @var array
     */
    public $header;
    /**
     * Timestamp of the archive that is being iterated over
     * @var int
     */
    public $iterate_timestamp;
    /**
     * Timestamp of the archive that is being used to store results in
     * @var int
     */
    public $result_timestamp;
    /**
     * Whether or not the iterator still has more documents
     * @var bool
     */
    public $end_of_iterator;
    /**
     * The path to the directory where the iteration status is stored.
     * @var string
     */
    public $result_dir;
    /**
     * Position value written to and read back from the checkpoint file
     * under the key 'current_partition_num'. Declared here (untyped, so
     * subclasses may redeclare it) because saveCheckpoint and
     * restoreCheckpoint access it; otherwise it would be a dynamic
     * property, which is deprecated as of PHP 8.2.
     * NOTE(review): presumably the index of the archive partition currently
     * being read -- confirm against the concrete iterators.
     * @var mixed
     */
    public $current_partition_num;
    /**
     * Position value written to and read back from the checkpoint file
     * under the key 'current_page_num'.
     * NOTE(review): presumably the index of the current page within the
     * archive -- confirm against the concrete iterators.
     * @var mixed
     */
    public $current_page_num;
    /**
     * Position value written to and read back from the checkpoint file
     * under the key 'current_offset'.
     * NOTE(review): presumably a byte offset into the current partition --
     * confirm against the concrete iterators.
     * @var mixed
     */
    public $current_offset;
    /**
     * Stores the current progress to the file iterate_status.txt in the result
     * dir such that a new instance of the iterator could be constructed and
     * return the next set of pages without having to process all of the pages
     * that came before. Each iterator should make a call to saveCheckpoint
     * after extracting a batch of pages.
     *
     * The four position fields kept by this class always overwrite any
     * same-named keys supplied in $info.
     *
     * @param array $info any extra info a subclass wants to save
     */
    public function saveCheckpoint($info = [])
    {
        $info['end_of_iterator'] = $this->end_of_iterator;
        $info['current_partition_num'] = $this->current_partition_num;
        $info['current_page_num'] = $this->current_page_num;
        $info['current_offset'] = $this->current_offset;
        file_put_contents("{$this->result_dir}/iterate_status.txt",
            serialize($info));
    }
    /**
     * Restores the internal state from the file iterate_status.txt in the
     * result dir such that the next call to nextPages will pick up from just
     * after the last checkpoint. Each iterator should make a call to
     * restoreCheckpoint at the end of the constructor method after the
     * instance members have been initialized.
     *
     * Every field written by saveCheckpoint (including current_page_num) is
     * restored when present in the checkpoint. If the status file is missing,
     * unreadable, or does not unserialize to an array, the iterator's state
     * is left untouched and an empty array is returned.
     *
     * @return array the data serialized when saveCheckpoint was called, or
     *      an empty array if no usable checkpoint could be read
     */
    public function restoreCheckpoint()
    {
        $status_file = "{$this->result_dir}/iterate_status.txt";
        if (!is_readable($status_file)) {
            return [];
        }
        $contents = file_get_contents($status_file);
        if ($contents === false) {
            return [];
        }
        // The checkpoint is written by saveCheckpoint of this same crawler,
        // so it is treated as trusted data here.
        $info = unserialize($contents);
        if (!is_array($info)) {
            return [];
        }
        $fields = ['end_of_iterator', 'current_partition_num',
            'current_page_num', 'current_offset'];
        foreach ($fields as $field) {
            // array_key_exists, not isset: a stored null is a legitimate
            // value that should be restored.
            if (array_key_exists($field, $info)) {
                $this->$field = $info[$field];
            }
        }
        return $info;
    }
    /**
     * Advances the iterator to the $limit page, with as little
     * additional processing as possible. The iterator is first reset to
     * the start; then, if $limit is positive, $limit pages are consumed
     * with processing disabled.
     *
     * @param int $limit page to advance to
     */
    public function seekPage($limit)
    {
        $this->reset();
        if ($limit > 0) {
            $this->nextPages($limit, true);
        }
    }
    /**
     * Estimates the importance of the site according to the weighting of
     * the particular archive iterator
     * @param array $site an associative array containing info about a web
     *      page
     * @return mixed a 4-bit number or false if iterator doesn't use default
     *      ranking method
     */
    abstract public function weight(&$site);
    /**
     * Gets the next $num many docs from the iterator
     * @param int $num number of docs to get
     * @param bool $no_process do not do any processing on page data
     * @return array associative arrays for $num pages
     */
    abstract public function nextPages($num, $no_process = false);
    /**
     * Resets the iterator to the start of the archive bundle
     */
    abstract public function reset();
}