Switch a couple places to file_put_contents over filePutContents so append works, get rid of unnecessary serialize pruneLinks, a=chris
Switch a couple places to file_put_contents over filePutContents so append works, get rid of unnecessary serialize pruneLinks, a=chris
diff --git a/src/controllers/MachineController.php b/src/controllers/MachineController.php
index 6eb696f57..45ea39e2a 100644
--- a/src/controllers/MachineController.php
+++ b/src/controllers/MachineController.php
@@ -161,10 +161,12 @@ class MachineController extends Controller implements CrawlConstants
}
if (!file_exists($error_log) || filesize($error_log) >
C\MAX_LOG_FILE_SIZE) {
- $this->web_site->filePutContents($error_log, $out_msg);
+ /* use file_put_contents as filePutContents doesn't
+ support FILE_APPEND
+ */
+ file_put_contents($error_log, $out_msg);
} else {
- $this->web_site->filePutContents($error_log, $out_msg,
- FILE_APPEND);
+ file_put_contents($error_log, $out_msg, FILE_APPEND);
}
CrawlDaemon::start("Fetcher", "$id-$channel", "$channel");
break;
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 9750c4475..a5e84143c 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2152,8 +2152,8 @@ class Fetcher implements CrawlConstants
* Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
* This method attempts to cull from the doc_info struct the
* best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
- * links which of filetype or sites the crawler is forbidden from crawl.
- * Then a crude estimate of the informaation contained in the links test:
+ * links of filetype or sites the crawler is forbidden from crawling.
+ * Then a crude estimate of the information contained in the links test:
* strlen(gzip(text)) is used to extract the best remaining links.
*
* @param array& $doc_info a string with a CrawlConstants::LINKS subarray
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 3d98a970f..529e9137b 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -905,9 +905,8 @@ class UrlParser
}
$info_link = [];
// choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
- foreach ($links as $url => $info) {
- $num_terms = count(preg_split("/\s+|\-|\_|\~/", $info));
- $text = serialize($info);
+ foreach ($links as $url => $text) {
+ $num_terms = count(preg_split("/\s+|\-|\_|\~/", $text));
$len_text = strlen($text) + 1;
$compressed_len = strlen(gzcompress($text)) + 1;
$effective_num_terms = $num_terms * ($compressed_len/$len_text);