diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index c10ed9e94..c63975df1 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -239,7 +239,6 @@ class QueueServer implements CrawlConstants, Join
* @var int
*/
public $index_dirty;
-
/**
* This keeps track of the time the current archive info was last modified
* This way the queue server knows if the user has changed the crawl
@@ -285,6 +284,30 @@ class QueueServer implements CrawlConstants, Join
* @var string
*/
public $process_name;
+ /**
+ * A mapping between class field names and parameters which might
+ * be sent to a queue server via an info associative array.
+ * @var array
+ */
+ public static $info_parameter_map = [
+ "crawl_order" => self::CRAWL_ORDER,
+ "crawl_type" => self::CRAWL_TYPE,
+ "crawl_index" => self::CRAWL_INDEX,
+ "cache_pages" => self::CACHE_PAGES,
+ "page_range_request" => self::PAGE_RANGE_REQUEST,
+ "max_depth" => self::MAX_DEPTH,
+ "repeat_type" => self::REPEAT_TYPE,
+ "robots_txt" => self::ROBOTS_TXT,
+ "max_description_len" => self::MAX_DESCRIPTION_LEN,
+ "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
+ "indexed_file_types" => self::INDEXED_FILE_TYPES,
+ "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
+ "allowed_sites" => self::ALLOWED_SITES,
+ "disallowed_sites" => self::DISALLOWED_SITES,
+ "page_rules" => self::PAGE_RULES,
+ "indexing_plugins" => self::INDEXING_PLUGINS,
+ "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA,
+ ];
/**
* Creates a Queue Server Daemon
*/
@@ -1249,28 +1272,9 @@ class QueueServer implements CrawlConstants, Join
{
//to get here we at least have to have a crawl_time
$this->crawl_time = $info[self::CRAWL_TIME];
- $read_from_info = [
- "crawl_order" => self::CRAWL_ORDER,
- "crawl_type" => self::CRAWL_TYPE,
- "crawl_index" => self::CRAWL_INDEX,
- "cache_pages" => self::CACHE_PAGES,
- "page_range_request" => self::PAGE_RANGE_REQUEST,
- "max_depth" => self::MAX_DEPTH,
- "repeat_type" => self::REPEAT_TYPE,
- "robots_txt" => self::ROBOTS_TXT,
- "max_description_len" => self::MAX_DESCRIPTION_LEN,
- "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
- "indexed_file_types" => self::INDEXED_FILE_TYPES,
- "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
- "allowed_sites" => self::ALLOWED_SITES,
- "disallowed_sites" => self::DISALLOWED_SITES,
- "page_rules" => self::PAGE_RULES,
- "indexing_plugins" => self::INDEXING_PLUGINS,
- "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA,
- ];
$try_to_set_from_old_index = [];
$update_disallow = false;
- foreach ($read_from_info as $index_field => $info_field) {
+ foreach (self::$info_parameter_map as $index_field => $info_field) {
if (isset($info[$info_field])) {
if ($index_field == "disallowed_sites") {
$update_disallow = true;
@@ -1290,7 +1294,7 @@ class QueueServer implements CrawlConstants, Join
$this->updateDisallowedQuotaSites();
}
$this->initializeWebQueue();
- $this->initializeIndexBundle($info);
+ $this->initializeIndexBundle($info, $try_to_set_from_old_index);
$info[self::STATUS] = self::CONTINUE_STATE;
return $info;
}
@@ -1301,9 +1305,16 @@ class QueueServer implements CrawlConstants, Join
*
* @param array $info if initializing a new crawl this should contain
* the crawl parameters
+ * @param array $try_to_set_from_old_index parameters of the crawl
+ * to try to set from values already stored in archive info,
+ * other parameters are assumed to have been updated since.
*/
- public function initializeIndexBundle($info = [])
+ public function initializeIndexBundle($info = [],
+ $try_to_set_from_old_index = null)
{
+ if ($try_to_set_from_old_index === null) {
+        $try_to_set_from_old_index = array_keys(self::$info_parameter_map);
+ }
if(empty($this->repeat_type) || $this->repeat_type < 0) {
$class_name = C\NS_LIB . "IndexArchiveBundle";
$dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
@@ -1318,9 +1329,10 @@ class QueueServer implements CrawlConstants, Join
$archive_info = $class_name::getArchiveInfo($dir);
$index_info = unserialize($archive_info['DESCRIPTION']);
foreach ($try_to_set_from_old_index as $index_field) {
- if (isset($index_info[$read_from_info[$index_field]]) ) {
+ if (isset($index_info[self::$info_parameter_map[$index_field]])
+ ) {
$this->$index_field =
- $index_info[$read_from_info[$index_field]];
+ $index_info[self::$info_parameter_map[$index_field]];
}
}
$archive_exists = true;
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 60f66be37..3d98a970f 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -906,7 +906,7 @@ class UrlParser
$info_link = [];
// choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
foreach ($links as $url => $info) {
- $num_terms = count(preg_split("/\s+/", $info));
+ $num_terms = count(preg_split("/\s+|\-|\_|\~/", $info));
$text = serialize($info);
$len_text = strlen($text) + 1;
$compressed_len = strlen(gzcompress($text)) + 1;
diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php
index d52f463e3..b8633e9a3 100755
--- a/src/library/processors/JpgProcessor.php
+++ b/src/library/processors/JpgProcessor.php
@@ -97,7 +97,7 @@ class JpgProcessor extends ImageProcessor
file_put_contents($temp_file, $page);
set_error_handler(null);
$summary[self::DESCRIPTION] = "$file_name\nEXIF DATA\n".
- print_r(exif_read_data($temp_file), true);
+ print_r(@exif_read_data($temp_file), true);
set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
} else {
$summary[self::DESCRIPTION] = $file_name;