Add curl flags for http2, add more version manager documentation and do tighter checks on WebArchive getObjects,a=chris

Chris Pollett [2018-04-20 23:Apr:th]
Add curl flags for http2, add more version manager documentation and do tighter checks on WebArchive getObjects,a=chris
Filename
src/configs/Config.php
src/controllers/ResourceController.php
src/executables/QueueServer.php
src/library/FetchUrl.php
src/library/VersionManager.php
src/library/WebArchive.php
src/library/WebQueueBundle.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index afed87782..395323608 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -603,6 +603,12 @@ nsconddefine("NUMBER_OF_LOG_FILES", 5);
  * file before re-requesting it
  */
 nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY);
+/**
+ * QueueServer caches in RAM up to this many robots.txt files
+ * to speed up checking if a url is okay to crawl. All robots.txt
+ * files are kept on disk, but might be slower to access if not in cache.
+ */
+nsconddefine('SIZE_ROBOT_TXT_CACHE', 2000);
 /**
  * Whether the scheduler should track ETag and Expires headers.
  * If you want to turn this off set the variable to false in
diff --git a/src/controllers/ResourceController.php b/src/controllers/ResourceController.php
index bfec9f1cc..fb538e817 100644
--- a/src/controllers/ResourceController.php
+++ b/src/controllers/ResourceController.php
@@ -269,7 +269,11 @@ class ResourceController extends Controller implements CrawlConstants
     {
         $current_start = $start;
         $current_end = $end;
-        list(, $range) = explode('=', $_SERVER['HTTP_RANGE'], 2);
+        $pre_range_parts = explode('=', $_SERVER['HTTP_RANGE'], 2);
+        $range = ",";
+        if (count($pre_range_parts) == 2) {
+            list(, $range) = $pre_range_parts;
+        }
         if (strpos($range, ',') !== false) {
             $this->web_site->header(
                 'HTTP/1.1 416 Requested Range Not Satisfiable');
@@ -304,14 +308,14 @@ class ResourceController extends Controller implements CrawlConstants
         $this->web_site->header("Content-Range: bytes $start-$end/$size");
         $this->web_site->header("Content-Length: ".$length);
         $buffer = 8192;
-        $position = 0;
+        $position = ftell($fp);
         while(!feof($fp) && $position <= $end && connection_status() == 0) {
-            $position = ftell($fp);
             if ($position + $buffer > $end) {
                 $buffer = $end - $position + 1;
             }
             echo fread($fp, $buffer);
             flush();
+            $position = ftell($fp);
         }
         fclose($fp);
     }
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 694e97745..da5dd64f0 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1845,7 +1845,7 @@ class QueueServer implements CrawlConstants, Join
         $info = [];
         $info[self::STATUS] = self::CONTINUE_STATE;
         if (file_exists(C\CRAWL_DIR."/schedules/".self::schedule_start_name)) {
-            L\crawlLog("Scheduler Start schedule urls".C\CRAWL_DIR .
+            L\crawlLog("Scheduler Start schedule urls" . C\CRAWL_DIR .
                 "/schedules/".self::schedule_start_name);
             $this->processDataArchive(
                 C\CRAWL_DIR."/schedules/".self::schedule_start_name);
@@ -2000,7 +2000,7 @@ class QueueServer implements CrawlConstants, Join
                 }
             }
             $this->web_queue->notifyFlush();
-            L\crawlLog(" time: ".L\changeInMicrotime($start_time));
+            L\crawlLog(" time: " . L\changeInMicrotime($start_time));
             L\crawlLog("C.. Scheduler: Add urls to queue");
             $start_time = microtime(true);
             /*
@@ -2009,7 +2009,7 @@ class QueueServer implements CrawlConstants, Join
              */
             $this->web_queue->addUrlsQueue($added_pairs);
         }
-        L\crawlLog("Scheduler: time: ".L\changeInMicrotime($start_time));
+        L\crawlLog("Scheduler: time: " . L\changeInMicrotime($start_time));
         L\crawlLog("Scheduler: Done queue schedule file: $file");
         unlink($file);
     }
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index 1f1974275..c218e4529 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -150,6 +150,8 @@ class FetchUrl implements CrawlConstants
                                     */
                 }
                 curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
+                curl_setopt($sites[$i][0], CURLOPT_HTTP_VERSION,
+                    CURL_HTTP_VERSION_2_0);
                 curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
                 curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYPEER, false);
                 curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
@@ -816,6 +818,8 @@ class FetchUrl implements CrawlConstants
         curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true);
         curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true);
         // these next two lines should probably be modified for better security
+        curl_setopt($agents[$host], CURLOPT_HTTP_VERSION,
+            CURL_HTTP_VERSION_2_0);
         curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 0);
         curl_setopt($agents[$host], CURLOPT_SSL_VERIFYPEER, false);
         curl_setopt($agents[$host], CURLOPT_NOSIGNAL, true);
diff --git a/src/library/VersionManager.php b/src/library/VersionManager.php
index df3549bda..57d4abd1e 100644
--- a/src/library/VersionManager.php
+++ b/src/library/VersionManager.php
@@ -58,34 +58,52 @@ class VersionManager
     const LOCK_FAIL = -1;
     const SUCCESS = 1;
     /**
+     * Name of subfolder in which to store files when a version of the managed
+     * folder is created.
      * @var string
      */
     public $archive_name;
     /**
+     * Filesystem path to the archive folder
      * @var string
      */
     public $archive_path;
     /**
+     * Filesystem path to file that is used for locking whether a new
+     * VersionManager public method is allowed to manipulate files in the
+     * archive.
      * @var string
      */
     public $lock_file;
     /**
+     * Path to folder in archive in which a list of all versions is maintained
      * @var string
      */
     public $versions_path;
     /**
+     * Hash algorithm to be applied to folder's files in order to come up with
+     * a name to store in a version in the archive.
      * @var string
      */
     public $hash_algorithm;
     /**
+     * Folder that is being managed (prior versions of files in it being
+     * maintained) by this VersionManager instance
      * @var string
      */
     public $managed_folder;
     /**
+     * File system permissions to use when storing version files into the
+     * version archive. If <=0 then use default file permissions
      * @var string
      */
     public $permissions;
     /**
+     * Creates an object which can be used to manage multiple versions of
+     * files in a managed folder, storing prior version in an archive folder
+     * using a hash_algorithm to determine how to name these archived files
+     * and saving these archived files according to some file system
+     * permissions.
      *
      * @param string $managed_folder what folder should be managed with
      *      this versioning system
@@ -96,7 +114,8 @@ class VersionManager
      * @param int $permissions what to set the file permissions to for the
      *      archive file. To keep things simple this defaults to world
      *      read write. In practice you probably want to tailor this to
-     *      the situation for security
+     *      the situation for security. If you set the value to <= 0
+     *      the permissions will be whatever your OS would use by default
      */
     public function __construct($managed_folder = '.',
         $archive_name = '.archive', $hash_algorithm = 'sha256',
@@ -114,7 +133,9 @@ class VersionManager
                 as $path) {
                 if (!file_exists($path)) {
                     mkdir($path);
-                    chmod($path, $this->permissions);
+                    if ($this->permissions > 0) {
+                        chmod($path, $this->permissions);
+                    }
                 }
             }
             $this->createVersion("", "", 1);
@@ -126,7 +147,7 @@ class VersionManager
      * folder will be made. If $file_changed is a nonexistent file in $folder
      * then the dir's in path to $file_changed will be updated.
      *
-     * @param string $file_changed
+     * @param string $file_changed
      * @param string $folder
      * @param int $now
      * @param bool $lock whether or not a lock should be obtained before
@@ -410,8 +431,13 @@ class VersionManager
         return self::SUCCESS;
     }
     /**
+     * Returns the files in the root directory in the most recent version of the
+     * repository together with a TIMESTAMP of the date when the most recent
+     * version was made.
      * @param bool $lock whether or not a lock should be obtained before
      *      carrying out the operation
+     * @return mixed either an array [TIMESTAMP => time of last version,
+     *      FILES => files in last version's folder] or LOCK_FAIL error code
      */
     public function headInfo($lock = true)
     {
@@ -423,9 +449,15 @@ class VersionManager
         return unserialize(file_get_contents("$version_path/HEAD"));
     }
     /**
+     * Retrieves the contents of a file from a particular version of the
+     * repository
      * @param string $file name of file to get data about
+     * @param int $timestamp which version to get the file out of
+     * @param bool $get_nearest_version if true then if $timestamp doesn't
+     *      exist as a version get the nearest version after $timestamp
      * @param bool $lock whether or not a lock should be obtained before
      *      carrying out the operation
+     * @return mixed either a string with the file's data or an error code
      */
     public function versionGetContents($file, $timestamp,
         $get_nearest_version = false, $lock = true)
@@ -460,6 +492,7 @@ class VersionManager
      *      carrying out the operation
      * @param bool $force_lock whether or not any existing lock should be
      *      ignored
+     * @return int success code
      */
     public function restoreVersion($timestamp, $lock = true,
         $force_lock = false)
@@ -661,7 +694,9 @@ class VersionManager
             $archive_path .= "/$prefix";
             if ($make_path && !file_exists($archive_path)) {
                 mkdir($archive_path);
-                chmod($archive_path, $this->permissions);
+                if ($this->permissions > 0) {
+                    chmod($archive_path, $this->permissions);
+                }
             }
         }
         return [$hash_name, $archive_path];
@@ -679,7 +714,9 @@ class VersionManager
             $version_path .= "/$prefix";
             if ($make_path && !file_exists($version_path)) {
                 mkdir($version_path);
-                chmod($version_path, $this->permissions);
+                if ($this->permissions > 0) {
+                    chmod($version_path, $this->permissions);
+                }
             }
         }
         return $version_path;
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 5fc1e9725..f4aefc915 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C;
 /**
  * Loads crawlLog functions if needed
  */
-require_once __DIR__."/Utility.php";
+require_once __DIR__ . "/Utility.php";
 /**
  *
  * Code used to manage web archive files
@@ -179,13 +179,10 @@ class WebArchive
         $info_string =
             $this->compressor->compress(serialize($info_block));
         $len = strlen($info_string) + $compressed_int_len;
-
         $offset = ftell($fh);
         ftruncate($fh, $offset);
-
         $out = $info_string.$this->compressor->compressInt($len);
         fwrite($fh, $out, $len);
-
         if ($open_flag) {
             fclose($fh);
         }
@@ -309,9 +306,12 @@ class WebArchive
      * @param int $num number of objects to return
      * @param bool $next_flag whether to advance the archive iterator
      * @param resource $fh either null or a file resource to the archive
+     * @param int $max_size maximum size returned object should be,
+     *      used as a sanity check against corrupted archives
      * @return array the $num objects beginning at $offset
      */
-    public function getObjects($offset, $num, $next_flag = true, $fh = null)
+    public function getObjects($offset, $num, $next_flag = true, $fh = null,
+        $max_size = C\MAX_ARCHIVE_OBJECT_SIZE)
     {
         $open_flag = false;
         if ($fh == null) {
@@ -324,7 +324,8 @@ class WebArchive
         if ($is_string) {
             $storage_len = strlen($this->storage);
         }
-        if ((!$is_string &&fseek($fh, $offset) == 0 ) || ($is_string
+        set_error_handler(null);
+        if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string
             && $offset < $storage_len)) {
             for ($i = 0; $i < $num; $i++) {
                 if (!$is_string && feof($fh)) {break; }
@@ -334,15 +335,13 @@ class WebArchive
                     ? substr($this->storage, $offset, $compressed_int_len)
                     : fread($fh, $compressed_int_len);
                 $len = $this->compressor->uncompressInt($compressed_len);
-                if ($len > 0 && $len < C\MAX_ARCHIVE_OBJECT_SIZE) {
+                if ($len > 0 && $len < $max_size) {
                     $compressed_file = ($is_string)
                         ? substr($this->storage, $offset + $compressed_int_len,
                             $len)
                         : fread($fh, $len);
-                    set_error_handler(null);
                     $file = $this->compressor->uncompress($compressed_file);
                     $object = @unserialize($file);
-                    set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
                     $offset += $compressed_int_len + $len;
                     $objects[] = [$offset, $object];
                 } else {
@@ -354,6 +353,7 @@ class WebArchive
                 $this->iterator_pos = $offset;
             }
         }
+        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
         if ($open_flag) {
             $this->close($fh);
         }
diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php
index 0d4589c99..76dbf79f0 100755
--- a/src/library/WebQueueBundle.php
+++ b/src/library/WebQueueBundle.php
@@ -268,23 +268,22 @@ class WebQueueBundle implements Notifier
             $robot_archive_name, new NonCompressor(), false, true);
         if (file_exists($dir_name . "/robot.dat")) {
             $this->robot_table =
-                HashTable::load($dir_name."/robot.dat");
+                HashTable::load($dir_name . "/robot.dat");
         } else {
             $this->robot_table =  new HashTable($dir_name.
                 "/robot.dat", 16*$num_urls_ram,
                  self::HASH_KEY_SIZE, self::INT_SIZE);
         }
         //filter to check for and determine crawl delay
-        if (file_exists($dir_name."/crawl_delay.ftr")) {
+        if (file_exists($dir_name . "/crawl_delay.ftr")) {
             $this->crawl_delay_filter =
-                BloomFilterFile::load($dir_name."/crawl_delay.ftr");
+                BloomFilterFile::load($dir_name . "/crawl_delay.ftr");
         } else {
             $this->crawl_delay_filter =
                 new BloomFilterFile($dir_name."/crawl_delay.ftr", $filter_size);
         }
         //Initialize B-Tree for storing cache page validation data
-        $this->etag_btree = new BTree($dir_name.'/etag_expires_tree');
-
+        $this->etag_btree = new BTree($dir_name . '/etag_expires_tree');
         $this->notify_buffer = [];
     }
     /**
@@ -442,7 +441,8 @@ class WebQueueBundle implements Notifier
         list($probe, $data) =  $both;
         $offset = unpackInt(substr($data, 0 , 4));
         $flag = unpackInt(substr($data, 8 , 4));
-        $url_obj = $this->to_crawl_archive->getObjects($offset, 1, true, $fh);
+        $url_obj = $this->to_crawl_archive->getObjects($offset, 1, true, $fh,
+            2 * C\MAX_URL_LEN);
         if (isset($url_obj[0][1][0])) {
             $url = $url_obj[0][1][0];
         } else {
@@ -569,19 +569,27 @@ class WebQueueBundle implements Notifier
     {
         // local cache of recent robot.txt stuff
         static $robot_cache = [];
-        $cache_size = 2000;
+        static $robot_cache_times = [];
         list($host, $path) = UrlParser::getHostAndPath($url, true, true);
         $path = urldecode($path);
         $key = crawlHash($host, true);
         if (isset($robot_cache[$key])) {
             $robot_object = $robot_cache[$key];
+            $robot_cache_times[$key] = microtime(true);
         } else {
             $data = $this->robot_table->lookup($key);
             $offset = unpackInt($data);
-            $robot_object = $this->robot_archive->getObjects($offset, 1);
+            $robot_object = $this->robot_archive->getObjects($offset, 1,
+                true, null, C\PAGE_RANGE_REQUEST);
             $robot_cache[$key] = $robot_object;
-            if (count($robot_cache) > $cache_size) {
-                array_shift($robot_cache);
+            $cache_now = microtime(true);
+            $robot_cache_times[$key] = $cache_now;
+            if (count($robot_cache) > C\SIZE_ROBOT_TXT_CACHE) {
+                asort($robot_cache_times);
+                reset($robot_cache_times);
+                $evict_key = key($robot_cache_times);
+                unset($robot_cache_times[$evict_key],
+                    $robot_cache[$evict_key]);
             }
         }
         $robot_paths = (isset($robot_object[0][1])) ? $robot_object[0][1]
@@ -670,7 +678,7 @@ class WebQueueBundle implements Notifier
      */
     public function setCrawlDelay($host, $value)
     {
-        $this->crawl_delay_filter->add("-1".$host);
+        $this->crawl_delay_filter->add("-1" . $host);
             //used to say a crawl delay has been set
         for ($i = 0; $i < 8; $i++) {
             if (($value & 1) == 1) {
@@ -690,7 +698,6 @@ class WebQueueBundle implements Notifier
         if (!$this->crawl_delay_filter->contains("-1".$host)) {
             return -1;
         }
-
         $value = 0;
         for ($i = 0; $i < 8; $i++) {
             if ($this->crawl_delay_filter->contains("$i".$host)) {
@@ -717,7 +724,6 @@ class WebQueueBundle implements Notifier
         return new HashTable($name, $num_values,
             self::HASH_KEY_SIZE, self::HASH_VALUE_SIZE);
     }
-
     /**
      * Looks up $key in the to-crawl hash table
      *
@@ -825,7 +831,6 @@ class WebQueueBundle implements Notifier
     {
         crawlLog("Rebuilding URL table");
         $dir_name = $this->dir_name;
-
         $count = $this->to_crawl_queue->count;
         $tmp_archive_name = $dir_name."/tmp_archive" .
             NonCompressor::fileExtension();
ViewGit