diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 2c383b466..04bf85727 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -401,7 +401,6 @@ class QueueServer implements CrawlConstants, Join
$remove = true;
}
}
-
if ($remove == true) {
L\crawlLog("Remove old messages..", $this->process_name);
}
@@ -736,7 +735,7 @@ class QueueServer implements CrawlConstants, Join
$count = $this->web_queue->to_crawl_queue->count;
$max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
if ($count < C\NUM_URLS_QUEUE_RAM -
- C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links){
+ C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
$info = $this->processQueueUrls();
}
if ($count > 0) {
@@ -2526,12 +2525,12 @@ class QueueServer implements CrawlConstants, Join
$next_slot = $this->getEarliestSlot($current_crawl_index,
$sites);
if ($next_slot < C\MAX_FETCH_SIZE) {
- $sites[$next_slot] = [$url, $weight, 0];
- $delete_urls[$i] = $url;
/* note don't add to seen url filter
since check robots every 24 hours as needed
*/
+ $sites[$next_slot] = [$url, $weight, 0];
$current_crawl_index = $next_slot;
+ $delete_urls[$i] = $url;
$fetch_size++;
$i++;
} else { //no more available slots so prepare to bail
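
A note on the slot logic in this hunk: the fetch batch is a fixed array of C\MAX_FETCH_SIZE slots, getEarliestSlot() is used to locate the next unoccupied slot, and a returned index of C\MAX_FETCH_SIZE or more means the batch is full and the loop bails. A minimal standalone sketch of that pattern, where getEarliestFreeSlot() and the constant are illustrative stand-ins rather than Yioop's actual implementation:

    <?php
    // Sketch only: hypothetical stand-in for the slot-scanning pattern used
    // when building a fetch batch; not QueueServer::getEarliestSlot() itself.
    const MAX_FETCH_SIZE = 5000;

    /** Return the first index >= $start whose slot is still empty. */
    function getEarliestFreeSlot(int $start, array $sites): int
    {
        for ($i = max($start, 0); $i < MAX_FETCH_SIZE; $i++) {
            if (empty($sites[$i])) {
                return $i;
            }
        }
        return MAX_FETCH_SIZE; // no free slot left, so the caller should bail
    }

    $sites = [];
    $slot = getEarliestFreeSlot(0, $sites);          // 0: the batch is empty
    $sites[$slot] = ["https://example.com/robots.txt", 1.0, 0];
    $slot = getEarliestFreeSlot($slot, $sites);      // 1: next free slot
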
@@ -2546,114 +2545,124 @@ class QueueServer implements CrawlConstants, Join
}
//Now handle the non-robots.txt url case
$robots_okay = true;
- if ($has_robots) {
- if ($no_flags) {
- if ($this->robots_txt == C\IGNORE_ROBOTS ||
- ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
- rtrim($url, "/") == rtrim($host_url, "/"))) {
- $robots_okay = true;
- } else if (!isset($hard_coded) || !$hard_coded) {
- $robots_okay = $this->web_queue->checkRobotOkay($url);
+ if (!$has_robots) {
+ $i++;
+ continue;
+ }
+ if ($no_flags) {
+ if ($this->robots_txt == C\IGNORE_ROBOTS ||
+ ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
+ rtrim($url, "/") == rtrim($host_url, "/"))) {
+ $robots_okay = true;
+ } else if (!isset($hard_coded) || !$hard_coded) {
+ $robots_okay = $this->web_queue->checkRobotOkay($url);
+ } else {
+ $robots_okay = true;
+ }
+ if (!$this->allowedToCrawlSite($url) ||
+ $this->disallowedToCrawlSite($url)) {
+ /* This is checked when added to queue,
+ we check again here in case allowed and disallowed
+ sites have changed since then
+ */
+ $robots_okay = false;
+ }
+ if (!$robots_okay) {
+ $delete_urls[$i] = $url;
+ $this->web_queue->addSeenUrlFilter($url);
+ $i++;
+ continue;
+ }
+ $delay = $this->web_queue->getCrawlDelay($host_url);
+ }
+ if (!$this->withinQuota($url)) {
+                //we're not allowed to schedule $url till next hour
+ $delete_urls[$i] = $url;
+ //delete from queue (so no clog) but don't mark seen
+ $i++;
+ continue;
+ }
+ //each host has two entries in $this->waiting_hosts
+ $num_waiting = floor(count($this->waiting_hosts)/2);
+ if ($delay > 0) {
+ // handle adding a url if there is a crawl delay
+ $hash_host = L\crawlHash($host_url);
+ $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
+ /*
+ To ensure that crawl-delay isn't violated by two separate
+ fetchers crawling the same host, if a host has a crawl
+ delay we only let it appear in one outstanding schedule
+ at a time. When data appears back from the fetcher handling
+ a crawl-delayed host, we'll clear it to appear in another
+ schedule
+ */
+ if ((!$is_waiting_host
+ && $num_waiting < C\MAX_WAITING_HOSTS) ||
+ $is_waiting_host && $this->waiting_hosts[$hash_host] ==
+ $schedule_time) {
+ $this->waiting_hosts[$hash_host] =
+ $schedule_time;
+ $this->waiting_hosts[$schedule_time][] =
+ $hash_host;
+ $request_batches_per_delay =
+ ceil($delay/$time_per_request_guess);
+ if (!isset($crawl_delay_hosts[$hash_host])) {
+ $next_earliest_slot = $current_crawl_index;
+ $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
} else {
- $robots_okay = true;
- }
- if (!$this->allowedToCrawlSite($url) ||
- $this->disallowedToCrawlSite($url)) {
- /* This is checked when added to queue,
- we check again here in case allowed and disallowed
- sites have changed since then
- */
- $robots_okay = false;
+ $next_earliest_slot = $crawl_delay_hosts[$hash_host]
+ + $request_batches_per_delay
+ * C\NUM_MULTI_CURL_PAGES;
}
- if (!$robots_okay) {
+ if (($next_slot =
+ $this->getEarliestSlot($next_earliest_slot,
+ $sites)) < C\MAX_FETCH_SIZE) {
+ $crawl_delay_hosts[$hash_host] = $next_slot;
$delete_urls[$i] = $url;
+ $sites[$next_slot] = [$url, $weight, $delay];
$this->web_queue->addSeenUrlFilter($url);
- $i++;
- continue;
+ /* we might miss some sites by marking them
+ seen after only scheduling them
+ */
+ $fetch_size++;
+ } else if ($no_flags) {
+ $this->web_queue->setQueueFlag($url,
+ $delay + WebQueueBundle::SCHEDULABLE);
}
- $delay = $this->web_queue->getCrawlDelay($host_url);
- }
- if (!$this->withinQuota($url)) {
- //we've not allowed to schedule $url till next hour
+ } else if (!$is_waiting_host) {
+ // has crawl delay but too many already waiting
$delete_urls[$i] = $url;
//delete from queue (so no clog) but don't mark seen
$i++;
continue;
}
- //each host has two entries in $this->waiting_hosts
- $num_waiting = floor(count($this->waiting_hosts)/2);
- if ($delay > 0 ) {
- // handle adding a url if there is a crawl delay
- $hash_host = L\crawlHash($host_url);
- $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
- if ((!$is_waiting_host
- && $num_waiting < C\MAX_WAITING_HOSTS) ||
- $is_waiting_host && $this->waiting_hosts[$hash_host] ==
- $schedule_time) {
- $this->waiting_hosts[$hash_host] =
- $schedule_time;
- $this->waiting_hosts[$schedule_time][] =
- $hash_host;
- $request_batches_per_delay =
- ceil($delay/$time_per_request_guess);
- if (!isset($crawl_delay_hosts[$hash_host])) {
- $next_earliest_slot = $current_crawl_index;
- $crawl_delay_hosts[$hash_host]= $next_earliest_slot;
- } else {
- $next_earliest_slot = $crawl_delay_hosts[$hash_host]
- + $request_batches_per_delay
- * C\NUM_MULTI_CURL_PAGES;
- }
- if (($next_slot =
- $this->getEarliestSlot( $next_earliest_slot,
- $sites)) < C\MAX_FETCH_SIZE) {
- $crawl_delay_hosts[$hash_host] = $next_slot;
- $delete_urls[$i] = $url;
- $sites[$next_slot] = [$url, $weight, $delay];
- $this->web_queue->addSeenUrlFilter($url);
- /* we might miss some sites by marking them
- seen after only scheduling them
- */
- $fetch_size++;
- } else if ($no_flags) {
- $this->web_queue->setQueueFlag($url,
- $delay + WebQueueBundle::SCHEDULABLE);
- }
- } else if (!$is_waiting_host) {
- // has crawl delay but too many already waiting
- $delete_urls[$i] = $url;
- //delete from queue (so no clog) but don't mark seen
- $i++;
- continue;
- }
- } else { // add a url no crawl delay
- $next_slot = $this->getEarliestSlot(
- $current_crawl_index, $sites);
- if ($next_slot < C\MAX_FETCH_SIZE) {
- $sites[$next_slot] = [$url, $weight, 0];
- $delete_urls[$i] = $url;
- $this->web_queue->addSeenUrlFilter($url);
- /* we might miss some sites by marking them
- seen after only scheduling them
- */
- $current_crawl_index = $next_slot;
- $fetch_size++;
- } else { //no more available slots so prepare to bail
- $i = $count;
- if ($no_flags) {
- $this->web_queue->setQueueFlag($url,
- WebQueueBundle::SCHEDULABLE);
- }
+ } else { // add a url no crawl delay
+ $next_slot = $this->getEarliestSlot($current_crawl_index,
+ $sites);
+ if ($next_slot < C\MAX_FETCH_SIZE) {
+ $sites[$next_slot] = [$url, $weight, 0];
+ $delete_urls[$i] = $url;
+ $this->web_queue->addSeenUrlFilter($url);
+ /* we might miss some sites by marking them
+ seen after only scheduling them
+ */
+ $current_crawl_index = $next_slot;
+ $fetch_size++;
+ } else { //no more available slots so prepare to bail
+ $i = $count;
+ if ($no_flags) {
+ $this->web_queue->setQueueFlag($url,
+ WebQueueBundle::SCHEDULABLE);
}
- } //if delay else
- } // if containsGotRobotTxt
- // handle robots.txt urls
+ }
+ } //no crawl-delay else
$i++;
} //end while
$this->web_queue->closeUrlArchive($fh);
$new_time = microtime(true);
L\crawlLog("...Scheduler: Done selecting URLS for fetch batch time ".
"so far:". L\changeInMicrotime($start_time));
+ L\crawlLog("...Scheduler: Examined urls while making fetch batch: $i");
$num_deletes = count($delete_urls);
$k = 0;
foreach ($delete_urls as $delete_url) {
@@ -2669,7 +2678,7 @@ class QueueServer implements CrawlConstants, Join
}
}
L\crawlLog("...Scheduler: Removed $k URLS for fetch batch from ".
- "queue in time: ".L\changeInMicrotime($new_time));
+ "queue in time: " . L\changeInMicrotime($new_time));
$new_time = microtime(true);
if (isset($sites) && count($sites) > 0 ) {
$dummy_slot = [self::DUMMY, 0.0, 0];
@@ -2693,7 +2702,7 @@ class QueueServer implements CrawlConstants, Join
//write schedule to disk
$fh = fopen(C\CRAWL_DIR.
"/schedules/".
- self::schedule_name.$this->crawl_time.".txt", "wb");
+ self::schedule_name.$this->crawl_time . ".txt", "wb");
fwrite($fh, $first_line);
$num_sites = count($sites);
$k = 0;
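
The reordered crawl-delay branch in the hunk above preserves the invariant spelled out in its comment: a host with a crawl delay may appear in only one outstanding schedule at a time, and each such host is tracked with two entries in $this->waiting_hosts (host hash => schedule time, plus schedule time => list of host hashes), which is why the host count is obtained by halving count($this->waiting_hosts). A rough self-contained sketch of that bookkeeping, where markHostWaiting() and the limit value are illustrative rather than QueueServer's actual API:

    <?php
    // Sketch only: hypothetical helper mirroring the waiting-hosts
    // double-entry bookkeeping described in the comments above.
    function markHostWaiting(array &$waiting_hosts, string $hash_host,
        int $schedule_time, int $max_waiting_hosts): bool
    {
        // each waiting host contributes two entries, so halve the count
        $num_waiting = floor(count($waiting_hosts) / 2);
        $is_waiting_host = isset($waiting_hosts[$hash_host]);
        if ((!$is_waiting_host && $num_waiting < $max_waiting_hosts) ||
            ($is_waiting_host &&
            $waiting_hosts[$hash_host] == $schedule_time)) {
            // forward entry: is this host waiting, and on which schedule?
            $waiting_hosts[$hash_host] = $schedule_time;
            // reverse entry: hosts to release once this schedule's fetch
            // data comes back from the fetcher
            $waiting_hosts[$schedule_time][] = $hash_host;
            return true;  // okay to place this host's url in the schedule
        }
        return false;     // host is tied to a different outstanding schedule,
                          // or too many hosts are already waiting
    }

    $waiting_hosts = [];
    $ok = markHostWaiting($waiting_hosts, sha1("https://example.com/"),
        time(), 250);
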
diff --git a/src/library/MailServer.php b/src/library/MailServer.php
index e0b5783bc..c3fc121b4 100644
--- a/src/library/MailServer.php
+++ b/src/library/MailServer.php
@@ -37,7 +37,7 @@ use seekquarry\yioop\library\MediaConstants;
/**
* Timing functions
*/
-require_once __DIR__."/Utility.php";
+require_once __DIR__ . "/Utility.php";
/**
* A small class for communicating with an SMTP server. Used to avoid
* configuration issues that might be needed with PHP's built-in mail()
@@ -144,7 +144,7 @@ class MailServer implements MediaConstants
"dev.null";
$this->server = $server;
if ($secure == "ssl") {
- 'ssl://'.$server;
+            $this->server = 'ssl://' . $server;
}
$this->port = $port;
$this->login = $login;
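
On the constructor hunk above: when $secure is "ssl" the intent is to reach the SMTP server over an ssl:// stream transport, which is how PHP's socket functions such as fsockopen() select TLS. A minimal sketch of that connection style, with a placeholder host and port rather than MailServer's actual fields:

    <?php
    // Sketch only: prefixing the host with ssl:// makes PHP open the
    // socket over TLS (placeholder host and port, not MailServer code).
    $server = "smtp.example.com";
    $port = 465;                       // common implicit-TLS SMTP port
    $fp = fsockopen("ssl://" . $server, $port, $errno, $errstr, 30);
    if ($fp === false) {
        exit("connect failed: $errno $errstr\n");
    }
    echo fgets($fp);                   // SMTP greeting, e.g. "220 ..."
    fwrite($fp, "QUIT\r\n");
    fclose($fp);
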
@@ -239,7 +239,7 @@ class MailServer implements MediaConstants
return $this->readResponseGetCode();
}
/**
- * Sends (or queues for media updater)an email
+ * Sends (or queues for media updater) an email
* (much like PHP's mail command, but not requiring
* a configured smtp server on the current machine)
*
@@ -340,14 +340,14 @@ class MailServer implements MediaConstants
webExit();
}
}
- $files = glob($mail_directory."/*.txt");
+ $files = glob($mail_directory . "/*.txt");
$file_count = count($files);
$current_count = 0;
$current_time = time();
$diff = 0;
if ($file_count > 0) {
$file = end($files);
- $file_name = str_replace($mail_directory."/", "", $file);
+ $file_name = str_replace($mail_directory . "/", "", $file);
$last_file_time = substr($file_name, 0, -4);
$diff = $current_time - $last_file_time;
}
@@ -371,7 +371,7 @@ class MailServer implements MediaConstants
" for $file_time.txt!\n");
}
} else {
- $fp = fopen($mail_directory."/".$last_file_time.".txt", "a+");
+ $fp = fopen($mail_directory . "/" . $last_file_time . ".txt", "a+");
if (flock($fp, LOCK_EX | LOCK_NB)) {
crawlLog("....Lock acquired! Sending emails now!\n");
fwrite($fp, $mail_details);
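
The hunks above touch the path where an email is queued for the media updater: the message is appended to a timestamped .txt file in the mail directory, with an exclusive non-blocking flock() guarding the append so concurrent requests do not interleave their writes. A rough sketch of that pattern, where queueMailMessage() is a hypothetical helper rather than MailServer's API:

    <?php
    // Sketch only: append a message to the newest timestamped spool file
    // under a non-blocking exclusive lock (hypothetical helper).
    function queueMailMessage(string $mail_directory, string $mail_details): bool
    {
        if (!file_exists($mail_directory)) {
            mkdir($mail_directory, 0777, true);
        }
        // reuse the newest spool file if one exists, else start a new one
        $files = glob($mail_directory . "/*.txt");
        $file_time = empty($files) ? time() :
            (int) str_replace([$mail_directory . "/", ".txt"], "", end($files));
        $fp = fopen($mail_directory . "/" . $file_time . ".txt", "a+");
        if ($fp === false) {
            return false;
        }
        if (flock($fp, LOCK_EX | LOCK_NB)) {  // don't block the web request
            fwrite($fp, $mail_details);
            flock($fp, LOCK_UN);
            fclose($fp);
            return true;
        }
        fclose($fp);
        return false;  // another process holds the lock; caller may retry
    }
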
diff --git a/src/library/media_jobs/BulkEmailJob.php b/src/library/media_jobs/BulkEmailJob.php
index c3511e8dc..b5dc6a79a 100644
--- a/src/library/media_jobs/BulkEmailJob.php
+++ b/src/library/media_jobs/BulkEmailJob.php
@@ -94,7 +94,7 @@ class BulkEmailJob extends MediaJob
if (!$sendable_file) {
return;
}
- L\crawlLog("Using Mail Directory:". $mail_directory);
+        L\crawlLog("Using Mail Directory: " . $mail_directory);
$emails_string = file_get_contents($sendable_file);
unlink($sendable_file);
$emails = explode(self::MESSAGE_SEPARATOR, $emails_string);
@@ -150,7 +150,7 @@ class BulkEmailJob extends MediaJob
if (!file_exists($mail_directory)) {
return false;
}
- $files = glob($mail_directory."/*.txt");
+ $files = glob($mail_directory . "/*.txt");
$sendable_file = false;
foreach ($files as $email_file) {
if (time() - filemtime($email_file) >
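
The hunk above is where BulkEmailJob looks for a batch that is safe to send: it globs the spool directory and, per the surrounding condition, only accepts a file whose last modification time is older than some threshold, so no request is still appending to it. A small sketch of that selection, where the helper name and the 300-second threshold are assumptions rather than the job's actual constants:

    <?php
    // Sketch only: pick the first spool file old enough to send
    // (hypothetical helper and threshold, not BulkEmailJob's constants).
    function findSendableMailFile(string $mail_directory, int $min_age)
    {
        if (!file_exists($mail_directory)) {
            return false;
        }
        foreach (glob($mail_directory . "/*.txt") ?: [] as $email_file) {
            if (time() - filemtime($email_file) > $min_age) {
                return $email_file; // old enough: no writer is still appending
            }
        }
        return false;
    }

    // The caller then reads the whole batch, deletes it, and splits it into
    // individual messages on a separator, as the job does with
    // file_get_contents(), unlink(), and explode().
    $sendable_file = findSendableMailFile("/tmp/mail_queue", 300);
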
diff --git a/src/scripts/suggest.js b/src/scripts/suggest.js
index 52b2ee2ac..684135a44 100644
--- a/src/scripts/suggest.js
+++ b/src/scripts/suggest.js
@@ -733,7 +733,7 @@ function spellCheck()
if (query.length > MIN_SPELL_CHECK_WIDTH) {
return;
}
- if (corrected_query.trim() != query) {
+ if (corrected_query.trim() != query.toLowerCase()) {
if (logged_in) {
var token_name = csrf_name;
var spell_link = "?" + token_name + "=" + csrf_token + "&q="