viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index afed87782..395323608 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -603,6 +603,12 @@ nsconddefine("NUMBER_OF_LOG_FILES", 5); * file before re-requesting it */ nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY); +/** + * QueueServer caches in RAM up to this many robots.txt files + * to speed up checking if a url is okay to crawl. All robots.txt + * files are kept on disk, but might be slower to access if not in cache. + */ +nsconddefine('SIZE_ROBOT_TXT_CACHE', 2000); /** * Whether the scheduler should track ETag and Expires headers. * If you want to turn this off set the variable to false in diff --git a/src/controllers/ResourceController.php b/src/controllers/ResourceController.php index bfec9f1cc..fb538e817 100644 --- a/src/controllers/ResourceController.php +++ b/src/controllers/ResourceController.php @@ -269,7 +269,11 @@ class ResourceController extends Controller implements CrawlConstants { $current_start = $start; $current_end = $end; - list(, $range) = explode('=', $_SERVER['HTTP_RANGE'], 2); + $pre_range_parts = explode('=', $_SERVER['HTTP_RANGE'], 2); + $range = ","; + if (count($pre_range_parts) == 2) { + list(, $range) = $pre_range_parts; + } if (strpos($range, ',') !== false) { $this->web_site->header( 'HTTP/1.1 416 Requested Range Not Satisfiable'); @@ -304,14 +308,14 @@ class ResourceController extends Controller implements CrawlConstants $this->web_site->header("Content-Range: bytes $start-$end/$size"); $this->web_site->header("Content-Length: ".$length); $buffer = 8192; - $position = 0; + $position = ftell($fp); while(!feof($fp) && $position <= $end && connection_status() == 0) { - $position = ftell($fp); if ($position + $buffer > $end) { $buffer = $end - $position + 1; } echo fread($fp, $buffer); flush(); + $position = ftell($fp); } fclose($fp); } diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 694e97745..da5dd64f0 100755 --- 
a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -1845,7 +1845,7 @@ class QueueServer implements CrawlConstants, Join $info = []; $info[self::STATUS] = self::CONTINUE_STATE; if (file_exists(C\CRAWL_DIR."/schedules/".self::schedule_start_name)) { - L\crawlLog("Scheduler Start schedule urls".C\CRAWL_DIR . + L\crawlLog("Scheduler Start schedule urls" . C\CRAWL_DIR . "/schedules/".self::schedule_start_name); $this->processDataArchive( C\CRAWL_DIR."/schedules/".self::schedule_start_name); @@ -2000,7 +2000,7 @@ class QueueServer implements CrawlConstants, Join } } $this->web_queue->notifyFlush(); - L\crawlLog(" time: ".L\changeInMicrotime($start_time)); + L\crawlLog(" time: " . L\changeInMicrotime($start_time)); L\crawlLog("C.. Scheduler: Add urls to queue"); $start_time = microtime(true); /* @@ -2009,7 +2009,7 @@ class QueueServer implements CrawlConstants, Join */ $this->web_queue->addUrlsQueue($added_pairs); } - L\crawlLog("Scheduler: time: ".L\changeInMicrotime($start_time)); + L\crawlLog("Scheduler: time: " . 
L\changeInMicrotime($start_time)); L\crawlLog("Scheduler: Done queue schedule file: $file"); unlink($file); } diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php index 1f1974275..c218e4529 100755 --- a/src/library/FetchUrl.php +++ b/src/library/FetchUrl.php @@ -150,6 +150,8 @@ class FetchUrl implements CrawlConstants */ } curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow); + curl_setopt($sites[$i][0], CURLOPT_HTTP_VERSION, + CURL_HTTP_VERSION_2_0); curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0); curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYPEER, false); curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true); @@ -816,6 +818,8 @@ class FetchUrl implements CrawlConstants curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true); curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true); // these next two lines should probably be modified for better security + curl_setopt($agents[$host], CURLOPT_HTTP_VERSION, + CURL_HTTP_VERSION_2_0); curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 0); curl_setopt($agents[$host], CURLOPT_SSL_VERIFYPEER, false); curl_setopt($agents[$host], CURLOPT_NOSIGNAL, true); diff --git a/src/library/VersionManager.php b/src/library/VersionManager.php index df3549bda..57d4abd1e 100644 --- a/src/library/VersionManager.php +++ b/src/library/VersionManager.php @@ -58,34 +58,52 @@ class VersionManager const LOCK_FAIL = -1; const SUCCESS = 1; /** + * Name of subfolder in which to store files when a version of the managed + * folder is created. * @var string */ public $archive_name; /** + * Filesystem path to the archive folder * @var string */ public $archive_path; /** + * Filesystem path to file that is used for locking whether a new + * VersionManager public method is allowed to manipulate files in the + * archive. 
* @var string */ public $lock_file; /** + * Path to folder in archive in which a list of all versions is maintained * @var string */ public $versions_path; /** + * Hash algorithm to be applied to folder's files in order to come up with + * a name to store in a version in the archive. * @var string */ public $hash_algorithm; /** + * Folder that is being managed (prior versions of files in it being + * maintained) by this VersionManager instance * @var string */ public $managed_folder; /** + * File system permissions to use when storing version files into the + * version archive. If <=0 then use default file permissions * @var string */ public $permissions; /** + * Creates an object which can be used to manage multiple versions of + * files in a managed folder, storing prior versions in an archive folder + * using a hash_algorithm to determine how to name these archived files + * and saving these archived files according to some file system + * permissions. * * @param string $managed_folder what folder should be managed with * this versioning system @@ -96,7 +114,8 @@ class VersionManager * @param int $permissions what to set the file permissions to for the * archive file. To keep things simple this defaults to world * read write. In practice you probably want to tailor this to - * the situation for security + * the situation for security. If you set the value to <= 0 + * the permissions will be whatever your OS would use by default */ public function __construct($managed_folder = '.', $archive_name = '.archive', $hash_algorithm = 'sha256', @@ -114,7 +133,9 @@ class VersionManager as $path) { if (!file_exists($path)) { mkdir($path); - chmod($path, $this->permissions); + if ($this->permissions > 0) { + chmod($path, $this->permissions); + } } } $this->createVersion("", "", 1); @@ -126,7 +147,7 @@ class VersionManager * folder will be made. If $file_changed is a nonexistent file in $folder * then the dir's in path to $file_changed will be updated. 
* - * @param string $file_changed + * @param string $file_changed * @param string $folder * @param int $now * @param bool $lock whether or not a lock should be obtained before @@ -410,8 +431,13 @@ class VersionManager return self::SUCCESS; } /** + * Returns the files in the root directory in the most recent version of the + * repository together with a TIMESTAMP of the date when the most recent + * version was made. * @param bool $lock whether or not a lock should be obtained before * carrying out the operation + * @return mixed either an array [TIMESTAMP => time of last version, + * FILES => files in last version's folder] or LOCK_FAIL error code */ public function headInfo($lock = true) { @@ -423,9 +449,15 @@ class VersionManager return unserialize(file_get_contents("$version_path/HEAD")); } /** + * Retrieves the contents of a file from a particular version of the + * repository * @param string $file name of file to get data about + * @param int $timestamp which version to get the file out of + * @param bool $get_nearest_version if true then if $timestamp doesn't + * exist as a version get the nearest version after $timestamp * @param bool $lock whether or not a lock should be obtained before * carrying out the operation + * @return mixed either a string with the file's data or an error code */ public function versionGetContents($file, $timestamp, $get_nearest_version = false, $lock = true) @@ -460,6 +492,7 @@ class VersionManager * carrying out the operation * @param bool $force_lock whether or not any existing lock should be * ignored + * @return int success code */ public function restoreVersion($timestamp, $lock = true, $force_lock = false) @@ -661,7 +694,9 @@ class VersionManager $archive_path .= "/$prefix"; if ($make_path && !file_exists($archive_path)) { mkdir($archive_path); - chmod($archive_path, $this->permissions); + if ($this->permissions > 0) { + chmod($archive_path, $this->permissions); + } } } return [$hash_name, $archive_path]; @@ -679,7 +714,9 
@@ class VersionManager $version_path .= "/$prefix"; if ($make_path && !file_exists($version_path)) { mkdir($version_path); - chmod($version_path, $this->permissions); + if ($this->permissions > 0) { + chmod($version_path, $this->permissions); + } } } return $version_path; diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index 5fc1e9725..f4aefc915 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * Loads crawlLog functions if needed */ -require_once __DIR__."/Utility.php"; +require_once __DIR__ . "/Utility.php"; /** * * Code used to manage web archive files @@ -179,13 +179,10 @@ class WebArchive $info_string = $this->compressor->compress(serialize($info_block)); $len = strlen($info_string) + $compressed_int_len; - $offset = ftell($fh); ftruncate($fh, $offset); - $out = $info_string.$this->compressor->compressInt($len); fwrite($fh, $out, $len); - if ($open_flag) { fclose($fh); } @@ -309,9 +306,12 @@ class WebArchive * @param int $num number of objects to return * @param bool $next_flag whether to advance the archive iterator * @param resource $fh either null or a file resource to the archive + * @param int $max_size maximum size returned object should be, + * use as a sanity check against corrupted archives * @return array the $num objects beginning at $offset */ - public function getObjects($offset, $num, $next_flag = true, $fh = null) + public function getObjects($offset, $num, $next_flag = true, $fh = null, + $max_size = C\MAX_ARCHIVE_OBJECT_SIZE) { $open_flag = false; if ($fh == null) { @@ -324,7 +324,8 @@ class WebArchive if ($is_string) { $storage_len = strlen($this->storage); } - if ((!$is_string &&fseek($fh, $offset) == 0 ) || ($is_string + set_error_handler(null); + if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string && $offset < $storage_len)) { for ($i = 0; $i < $num; $i++) { if (!$is_string && feof($fh)) {break; } @@ -334,15 +335,13 @@ class 
WebArchive ? substr($this->storage, $offset, $compressed_int_len) : fread($fh, $compressed_int_len); $len = $this->compressor->uncompressInt($compressed_len); - if ($len > 0 && $len < C\MAX_ARCHIVE_OBJECT_SIZE) { + if ($len > 0 && $len < $max_size) { $compressed_file = ($is_string) ? substr($this->storage, $offset + $compressed_int_len, $len) : fread($fh, $len); - set_error_handler(null); $file = $this->compressor->uncompress($compressed_file); $object = @unserialize($file); - set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); $offset += $compressed_int_len + $len; $objects[] = [$offset, $object]; } else { @@ -354,6 +353,7 @@ class WebArchive $this->iterator_pos = $offset; } } + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); if ($open_flag) { $this->close($fh); } diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php index 0d4589c99..76dbf79f0 100755 --- a/src/library/WebQueueBundle.php +++ b/src/library/WebQueueBundle.php @@ -268,23 +268,22 @@ class WebQueueBundle implements Notifier $robot_archive_name, new NonCompressor(), false, true); if (file_exists($dir_name . "/robot.dat")) { $this->robot_table = - HashTable::load($dir_name."/robot.dat"); + HashTable::load($dir_name . "/robot.dat"); } else { $this->robot_table = new HashTable($dir_name. "/robot.dat", 16*$num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE); } //filter to check for and determine crawl delay - if (file_exists($dir_name."/crawl_delay.ftr")) { + if (file_exists($dir_name . "/crawl_delay.ftr")) { $this->crawl_delay_filter = - BloomFilterFile::load($dir_name."/crawl_delay.ftr"); + BloomFilterFile::load($dir_name . "/crawl_delay.ftr"); } else { $this->crawl_delay_filter = new BloomFilterFile($dir_name."/crawl_delay.ftr", $filter_size); } //Initialize B-Tree for storing cache page validation data - $this->etag_btree = new BTree($dir_name.'/etag_expires_tree'); - + $this->etag_btree = new BTree($dir_name . 
'/etag_expires_tree'); $this->notify_buffer = []; } /** @@ -442,7 +441,8 @@ class WebQueueBundle implements Notifier list($probe, $data) = $both; $offset = unpackInt(substr($data, 0 , 4)); $flag = unpackInt(substr($data, 8 , 4)); - $url_obj = $this->to_crawl_archive->getObjects($offset, 1, true, $fh); + $url_obj = $this->to_crawl_archive->getObjects($offset, 1, true, $fh, + 2 * C\MAX_URL_LEN); if (isset($url_obj[0][1][0])) { $url = $url_obj[0][1][0]; } else { @@ -569,19 +569,27 @@ class WebQueueBundle implements Notifier { // local cache of recent robot.txt stuff static $robot_cache = []; - $cache_size = 2000; + static $robot_cache_times = []; list($host, $path) = UrlParser::getHostAndPath($url, true, true); $path = urldecode($path); $key = crawlHash($host, true); if (isset($robot_cache[$key])) { $robot_object = $robot_cache[$key]; + $robot_cache_times[$key] = microtime(true); } else { $data = $this->robot_table->lookup($key); $offset = unpackInt($data); - $robot_object = $this->robot_archive->getObjects($offset, 1); + $robot_object = $this->robot_archive->getObjects($offset, 1, + true, null, C\PAGE_RANGE_REQUEST); $robot_cache[$key] = $robot_object; - if (count($robot_cache) > $cache_size) { - array_shift($robot_cache); + $cache_now = microtime(true); + $robot_cache_times[$key] = $cache_now; + if (count($robot_cache) > C\SIZE_ROBOT_TXT_CACHE) { + asort($robot_cache_times); + reset($robot_cache_times); + $evict_key = key($robot_cache_times); + unset($robot_cache_times[$evict_key], + $robot_cache[$evict_key]); } } $robot_paths = (isset($robot_object[0][1])) ? $robot_object[0][1] @@ -670,7 +678,7 @@ class WebQueueBundle implements Notifier */ public function setCrawlDelay($host, $value) { - $this->crawl_delay_filter->add("-1".$host); + $this->crawl_delay_filter->add("-1" . 
$host); //used to say a crawl delay has been set for ($i = 0; $i < 8; $i++) { if (($value & 1) == 1) { @@ -690,7 +698,6 @@ class WebQueueBundle implements Notifier if (!$this->crawl_delay_filter->contains("-1".$host)) { return -1; } - $value = 0; for ($i = 0; $i < 8; $i++) { if ($this->crawl_delay_filter->contains("$i".$host)) { @@ -717,7 +724,6 @@ class WebQueueBundle implements Notifier return new HashTable($name, $num_values, self::HASH_KEY_SIZE, self::HASH_VALUE_SIZE); } - /** * Looks up $key in the to-crawl hash table * @@ -825,7 +831,6 @@ class WebQueueBundle implements Notifier { crawlLog("Rebuilding URL table"); $dir_name = $this->dir_name; - $count = $this->to_crawl_queue->count; $tmp_archive_name = $dir_name."/tmp_archive" . NonCompressor::fileExtension();