Switch a couple places to file_put_contents over filePutContents so append works, get rid of unnecessary serialize pruneLinks, a=chris
Switch a couple places to file_put_contents over filePutContents so append works, get rid of unnecessary serialize pruneLinks, a=chris
diff --git a/src/controllers/MachineController.php b/src/controllers/MachineController.php
index 6eb696f57..45ea39e2a 100644
--- a/src/controllers/MachineController.php
+++ b/src/controllers/MachineController.php
@@ -161,10 +161,12 @@ class MachineController extends Controller implements CrawlConstants
}
if (!file_exists($error_log) || filesize($error_log) >
C\MAX_LOG_FILE_SIZE) {
- $this->web_site->filePutContents($error_log, $out_msg);
+ /* use file_put_contents as filePutContents doesn't
+ support FILE_APPEND
+ */
+ file_put_contents($error_log, $out_msg);
} else {
- $this->web_site->filePutContents($error_log, $out_msg,
- FILE_APPEND);
+ file_put_contents($error_log, $out_msg, FILE_APPEND);
}
CrawlDaemon::start("Fetcher", "$id-$channel", "$channel");
break;
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 9750c4475..a5e84143c 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2152,8 +2152,8 @@ class Fetcher implements CrawlConstants
* Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
* This method attempts to cull from the doc_info struct the
* best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
- * links which of filetype or sites the crawler is forbidden from crawl.
- * Then a crude estimate of the informaation contained in the links test:
+ * links of filetype or sites the crawler is forbidden from crawling.
+ * Then a crude estimate of the information contained in the links test:
* strlen(gzip(text)) is used to extract the best remaining links.
*
* @param array& $doc_info a string with a CrawlConstants::LINKS subarray
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 3d98a970f..529e9137b 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -905,9 +905,8 @@ class UrlParser
}
$info_link = [];
// choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
- foreach ($links as $url => $info) {
- $num_terms = count(preg_split("/\s+|\-|\_|\~/", $info));
- $text = serialize($info);
+ foreach ($links as $url => $text) {
+ $num_terms = count(preg_split("/\s+|\-|\_|\~/", $text));
$len_text = strlen($text) + 1;
$compressed_len = strlen(gzcompress($text)) + 1;
$effective_num_terms = $num_terms * ($compressed_len/$len_text);