Tweaks to FetchUrl to get it working by default on Ubuntu 18.04 LTS; tweaks to default video sources, a=chris

Chris Pollett [2018-05-19, 16h]
Tweaks to FetchUrl to get it working by default on Ubuntu 18.04 LTS; tweaks to default video sources, a=chris
Filename
src/configs/Createdb.php
src/configs/default_crawl.ini
src/controllers/Controller.php
src/data/public_default.db
src/executables/QueueServer.php
src/library/FetchUrl.php
diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php
index ab55e516b..ee3bacb15 100755
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@@ -442,22 +442,19 @@ foreach ($new_user_activities as $new_activity) {
     }
 }
 $db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634195',
-    'YouTube', 'video', 'http://www.youtube.com/watch?v={}',
+    'YouTube', 'video', 'https://www.youtube.com/watch?v={}',
     'http://i1.ytimg.com/vi/{}/default.jpg', '')");
 $db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634196',
-    'MetaCafe', 'video', 'http://www.metacafe.com/watch/{}',
+    'MetaCafe', 'video', 'https://www.metacafe.com/watch/{}',
     '/resources/blank.png?{}', '')");
 $db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634197',
-    'DailyMotion', 'video', 'http://www.dailymotion.com/video/{}',
+    'DailyMotion', 'video', 'https://www.dailymotion.com/video/{}',
     '/resources/blank.png?{}', '')");
 $db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634198',
-    'Vimeo', 'video', 'http://player.vimeo.com/video/{}',
+    'Vimeo', 'video', 'https://player.vimeo.com/video/{}',
     '/resources/blank.png?{}', '')");
-$db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634199',
-    'Break.com', 'video', '/resources/blank.png?{}', '" .
-    NAME_SERVER . "/resources/blank.png?{}', '')");
 $db->execute("INSERT INTO MEDIA_SOURCE VALUES ('1342634200',
-    'Yahoo News', 'rss', 'http://news.yahoo.com/rss/',
+    'Yahoo News', 'rss', 'https://news.yahoo.com/rss/',
     '//content/@url', 'en-US')");
 $db->execute("INSERT INTO CRAWL_MIXES VALUES (2, 'images', ".ROOT_ID.", -1)");
 $db->execute("INSERT INTO MIX_FRAGMENTS VALUES(2, 0, 1)");
diff --git a/src/configs/default_crawl.ini b/src/configs/default_crawl.ini
index 1e965e531..f99f28ba0 100644
--- a/src/configs/default_crawl.ini
+++ b/src/configs/default_crawl.ini
@@ -22,13 +22,13 @@
 ;
 [general]
 crawl_order = 'ad';
-summarizer_option = 'dl';
+summarizer_option = 'dk';
 crawl_type = 'ax';
 crawl_index = '1483056689';
 arc_dir = "";
 arc_type = "";
 page_recrawl_frequency = '-1';
-page_range_request = '50000';
+page_range_request = '100000';
 max_description_len = '10000';
 cache_pages = true;
 restrict_sites_by_url = false;
diff --git a/src/controllers/Controller.php b/src/controllers/Controller.php
index 874d35f91..27e8bf1c6 100755
--- a/src/controllers/Controller.php
+++ b/src/controllers/Controller.php
@@ -378,7 +378,7 @@ abstract class Controller
      *  Method to perform a 302 redirect to $location in both under web server
      *  and CLI setting
      *
-     *  @param string $location url to redirect to
+     * @param string $location url to redirect to
      */
     public function redirectLocation($location)
     {
diff --git a/src/data/public_default.db b/src/data/public_default.db
index 454d377f6..ad9f2d368 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index cc87316cc..692b671cd 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -308,6 +308,7 @@ class QueueServer implements CrawlConstants, Join
         $this->indexing_plugins = [];
         $this->indexing_plugins_data = [];
         $this->video_sources = [];
+        $this->waiting_hosts = [];
         $this->server_name = "IndexerAndScheduler";
         $this->process_name = "QueueServer";
     }
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index a6fbaf903..e249a3dc4 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -74,15 +74,7 @@ class FetchUrl implements CrawlConstants
         $post_data = null, $follow = false, $tor_proxy = "",
         $proxy_servers=[])
     {
-        static $agent_handler = null;
-        static $handler_time = 0;
-        if (empty($agent_handler)) {
-            /* try to keep handler around between calls to allow for connection
-                reuse
-             */
-            $agent_handler = curl_multi_init();
-            $handler_time = microtime(true);
-        }
+        $agent_handler = curl_multi_init();
         $active = null;
         $start_time = microtime(true);
         if (!$minimal && $temp_dir == null) {
@@ -420,11 +412,8 @@ class FetchUrl implements CrawlConstants
             crawlLog("  Get Page Content time ".
                 (changeInMicrotime($start_time)));
         }
-        if (microtime(true) - $handler_time > C\PAGE_TIMEOUT) {
-            if (!empty($agent_handler)) {
-                curl_multi_close($agent_handler);
-            }
-            $agent_handler = null;
+        if (!empty($agent_handler)) {
+            curl_multi_close($agent_handler);
         }
         return $sites;
     }
ViewGit