Fix some unit tests that were messed up, lower size of robot txt cache, improve handling if robot archive gets garbled, a=chris

Chris Pollett [2018-04-21]
Fix some unit tests that were messed up, lower size of robot txt cache, improve handling if robot archive gets garbled, a=chris
Filename
src/configs/Config.php
src/executables/Fetcher.php
src/library/PhraseParser.php
src/library/VersionManager.php
src/library/WebArchive.php
src/library/WebQueueBundle.php
src/library/processors/RobotProcessor.php
tests/PhraseParserTest.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 395323608..f6a2eb0cc 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -608,7 +608,7 @@ nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY);
  * to speed up checking if a url is okay to crawl. All robots.txt
  * files are kept on disk, but might be slower to access if not in cache.
  */
-nsconddefine('SIZE_ROBOT_TXT_CACHE', 2000);
+nsconddefine('SIZE_ROBOT_TXT_CACHE', 1000);
 /**
  * Whether the scheduler should track ETag and Expires headers.
  * If you want to turn this off set the variable to false in
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index d62360916..e5e1ff760 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1550,10 +1550,12 @@ class Fetcher implements CrawlConstants
                       if we will give a page a second try if it doesn't
                       download the first time
                     */
-                    if (UrlParser::getDocumentFilename($seeds[$i][self::URL]).
-                        ".".UrlParser::getDocumentType($seeds[$i][self::URL])
+                    if (UrlParser::getDocumentFilename($seeds[$i][self::URL]) .
+                        "." . UrlParser::getDocumentType($seeds[$i][self::URL])
                         == "robots.txt") {
-                        $seeds[$i][self::ROBOT_PATHS] = [];
+                        $seeds[$i][self::ROBOT_PATHS] = [
+                            self::ALLOWED_SITES => [],
+                            self::DISALLOWED_SITES => []];
                     }
                     $i++;
                 }
@@ -1607,10 +1609,8 @@ class Fetcher implements CrawlConstants
     public function reschedulePages(&$site_pages)
     {
         $start_time = microtime(true);
-
         $downloaded = [];
         $not_downloaded = [];
-
         foreach ($site_pages as $site) {
             if ( (isset($site[self::ROBOT_PATHS]) || isset($site[self::PAGE]))
                 && (is_numeric($site[self::HTTP_CODE] ) &&
@@ -1679,11 +1679,12 @@ class Fetcher implements CrawlConstants
                     $type = $site[self::TYPE];
                     if ($response_code != 404) {
                         /*
-                            disallow crawling if robots.txt was any error other
+                            disallow crawling if robots.txt has any error other
                             that not found
                          */
                         $was_robot_error = true;
-                        $site[self::ROBOT_PATHS][] = "/";
+                        $site[self::ROBOT_PATHS][
+                            self::DISALLOWED_SITES][] = "/";
                      }
                 }
             } else if (isset($site[self::FILE_NAME])) {
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index e7a8c6008..623884b49 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -756,7 +756,9 @@ class PhraseParser
      */
     public static function segmentSegment($segment, $lang)
     {
-        if ($segment == "") { return [];}
+        if ($segment == "") {
+            return [];
+        }
         $term_string = "";
         if ($lang != null) {
             $segment_obj = self::getTokenizer($lang);
@@ -1209,7 +1211,7 @@ class PhraseParser
         $char_added = "";
         $word_guess = "";
         $was_space = true;
-        while($cur_pos >= 0) {
+        while($cur_pos > 0) {
             $cur_pos--;
             $char_added =  mb_substr($segment, $cur_pos, 1);
             $is_space = trim($char_added) == "";
@@ -1220,7 +1222,7 @@ class PhraseParser
                 $one_word = self::oneWord($word_guess, $locale,
                     $additional_regexes);
                 if ($one_word) {
-                    $out_segment .= " ".strrev($word_guess);
+                    $out_segment .= " " . strrev($word_guess);
                     $word_guess = "";
                 } else {
                     $out_segment .= " ". strrev(mb_substr($word_guess, 1));
@@ -1244,11 +1246,11 @@ class PhraseParser
             }
             if (!$is_suffix) {
                 if (mb_strlen($word_guess) > 1 &&
-                    !self::oneWord($word_guess, $locale, $additional_regexes)){
+                    !self::oneWord($word_guess, $locale, $additional_regexes)) {
                     $out_segment .= " ".strrev(mb_substr($word_guess, 1));
                     $word_guess = $char_added;
                 } else {
-                    $out_segment .= " ".strrev($word_guess);
+                    $out_segment .= " " . strrev($word_guess);
                     $word_guess = "";
                 }
                 $was_space = false;
diff --git a/src/library/VersionManager.php b/src/library/VersionManager.php
index 57d4abd1e..7e1f9d435 100644
--- a/src/library/VersionManager.php
+++ b/src/library/VersionManager.php
@@ -147,7 +147,7 @@ class VersionManager
      * folder will be made. If $file_changed is a nonexistent file in $folder
      * then the dir's in path to $file_changed will be updated.
      *
-     * @param string $file_changed
+     * @param string $file_changed
      * @param string $folder
      * @param int $now
      * @param bool $lock whether or not a lock should be obtained before
@@ -965,7 +965,9 @@ class VersionManager
             if (is_file($file_or_dir) || is_link($file_or_dir)) {
                 unlink($file_or_dir);
             } else {
-                rmdir($file_or_dir);
+                if (count(scandir($file_or_dir)) == 2) {
+                    rmdir($file_or_dir);
+                }
             }
         }
         return self::SUCCESS;
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index f4aefc915..96099cdc3 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -324,7 +324,6 @@ class WebArchive
         if ($is_string) {
             $storage_len = strlen($this->storage);
         }
-        set_error_handler(null);
         if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string
             && $offset < $storage_len)) {
             for ($i = 0; $i < $num; $i++) {
@@ -353,7 +352,6 @@ class WebArchive
                 $this->iterator_pos = $offset;
             }
         }
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
         if ($open_flag) {
             $this->close($fh);
         }
diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php
index 76dbf79f0..d2fb724ec 100755
--- a/src/library/WebQueueBundle.php
+++ b/src/library/WebQueueBundle.php
@@ -594,14 +594,19 @@ class WebQueueBundle implements Notifier
         }
         $robot_paths = (isset($robot_object[0][1])) ? $robot_object[0][1]
             : []; //these should have been urldecoded in RobotProcessor
-        $robots_okay = true;
-        $robots_not_okay = false;
-        if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
+        if (empty($robot_paths)) {
+            $robots_okay = false;
+            $robots_not_okay = true;
+        } else {
+            $robots_okay = true;
+            $robots_not_okay = false;
+        }
+        if (!empty($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
             $robots_not_okay = UrlParser::isPathMemberRegexPaths($path,
                 $robot_paths[CrawlConstants::DISALLOWED_SITES]);
             $robots_okay = !$robots_not_okay;
         }
-        if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
+        if (!empty($robot_paths[CrawlConstants::ALLOWED_SITES])) {
             $robots_okay = UrlParser::isPathMemberRegexPaths($path,
                 $robot_paths[CrawlConstants::ALLOWED_SITES]);
         }
diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php
index d5f6f947a..5e74af1c7 100644
--- a/src/library/processors/RobotProcessor.php
+++ b/src/library/processors/RobotProcessor.php
@@ -81,7 +81,8 @@ class RobotProcessor extends PageProcessor
         $summary[self::TITLE] = "";
         $summary[self::DESCRIPTION] = "";
         $summary[self::LANG] = null;
-        $summary[self::ROBOT_PATHS] = [];
+        $summary[self::ROBOT_PATHS] = [self::ALLOWED_SITES => [],
+            self::DISALLOWED_SITES => []];
         $summary[self::AGENT_LIST] = [];
         $summary[self::LINKS] = [];
         $host_url = UrlParser::getHost($url);
@@ -112,12 +113,14 @@ class RobotProcessor extends PageProcessor
                     $summary[self::AGENT_LIST][] = $value;
                     $current_specificness =
                         (strcmp($value, C\USER_AGENT_SHORT) == 0) ? 1 : 0;
-                    if ($current_specificness < $specificness) break;
+                    if ($current_specificness < $specificness) { break; }
                     if ($specificness < $current_specificness) {
                         //Give precedence to exact match on agent string
                         $specificness = $current_specificness;
                         $add_rule_state = true;
-                        $summary[self::ROBOT_PATHS] = [];
+                        $summary[self::ROBOT_PATHS] =
+                            [self::ALLOWED_SITES => [],
+                            self::DISALLOWED_SITES => []];
                         break;
                     }
                     $agent_parts = explode("*", $value);
diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php
index cada8181f..4f9e520f6 100644
--- a/tests/PhraseParserTest.php
+++ b/tests/PhraseParserTest.php
@@ -236,10 +236,10 @@ EOD;
     public function computeCharGramTestCase()
     {
         $n_grams = PhraseParser::getNGramsTerm(["orange"], 6);
-        $this->assertTrue(count(($n_grams) == 1), "NGram Test 1");
+        $this->assertTrue((count($n_grams) == 1), "NGram Test 1");
         $three_grams = ['ora', 'ran', 'ang', 'nge'];
         $n_grams = PhraseParser::getNGramsTerm(["orange"], 3);
-        $this->assertTrue(count(($three_grams) == 4), "NGram Test 2");
+        $this->assertTrue((count($three_grams) == 4), "NGram Test 2");
         $this->assertTrue(array_diff($three_grams, $n_grams) == [],
             "NGram Test 3");
         $n_grams = PhraseParser::getCharGramsTerm(["orange"], 'en-US');
@@ -253,12 +253,12 @@ EOD;
     {
         $segments = PhraseParser::segmentSegment("你们好吗", 'zh-CN');
         $correct_segments = ["你们", "好", "吗"];
-        $this->assertTrue(count(($segments) == 3), "Segmenter Test 1");
+        $this->assertTrue((count($segments) == 3), "Segmenter Test 1");
         $this->assertTrue(array_diff($segments, $correct_segments) == [],
             "Segmenter Test 2");
         $segments = PhraseParser::segmentSegment("你们好吗?", 'zh-CN');
         $correct_segments = ["你们", "好", "吗", "?"];
-        $this->assertTrue(count(($segments) == 3), "Segmenter Test 3");
+        $this->assertTrue((count($segments) == 4), "Segmenter Test 3");
         $this->assertTrue(array_diff($segments, $correct_segments) == [],
             "Segmenter Test 4");
     }
ViewGit