diff --git a/src/configs/Config.php b/src/configs/Config.php index 395323608..f6a2eb0cc 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -608,7 +608,7 @@ nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY); * to speed up checking if a url is okay to crawl. All robots.txt * files are kept on disk, but might be slower to access if not in cache. */ -nsconddefine('SIZE_ROBOT_TXT_CACHE', 2000); +nsconddefine('SIZE_ROBOT_TXT_CACHE', 1000); /** * Whether the scheduler should track ETag and Expires headers. * If you want to turn this off set the variable to false in diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index d62360916..e5e1ff760 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1550,10 +1550,12 @@ class Fetcher implements CrawlConstants if we will give a page a second try if it doesn't download the first time */ - if (UrlParser::getDocumentFilename($seeds[$i][self::URL]). - ".".UrlParser::getDocumentType($seeds[$i][self::URL]) + if (UrlParser::getDocumentFilename($seeds[$i][self::URL]) . + "." . UrlParser::getDocumentType($seeds[$i][self::URL]) == "robots.txt") { - $seeds[$i][self::ROBOT_PATHS] = []; + $seeds[$i][self::ROBOT_PATHS] = [ + self::ALLOWED_SITES => [], + self::DISALLOWED_SITES => []]; } $i++; } @@ -1607,10 +1609,8 @@ class Fetcher implements CrawlConstants public function reschedulePages(&$site_pages) { $start_time = microtime(true); - $downloaded = []; $not_downloaded = []; - foreach ($site_pages as $site) { if ( (isset($site[self::ROBOT_PATHS]) || isset($site[self::PAGE])) && (is_numeric($site[self::HTTP_CODE] ) && @@ -1679,11 +1679,12 @@ class Fetcher implements CrawlConstants $type = $site[self::TYPE]; if ($response_code != 404) { /* - disallow crawling if robots.txt was any error other + disallow crawling if robots.txt has any error other that not found */ $was_robot_error = true; - $site[self::ROBOT_PATHS][] = "/"; + $site[self::ROBOT_PATHS][ + self::DISALLOWED_SITES][] = "/"; } } } else if (isset($site[self::FILE_NAME])) { diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index e7a8c6008..623884b49 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -756,7 +756,9 @@ class PhraseParser */ public static function segmentSegment($segment, $lang) { - if ($segment == "") { return [];} + if ($segment == "") { + return []; + } $term_string = ""; if ($lang != null) { $segment_obj = self::getTokenizer($lang); @@ -1209,7 +1211,7 @@ class PhraseParser $char_added = ""; $word_guess = ""; $was_space = true; - while($cur_pos >= 0) { + while($cur_pos > 0) { $cur_pos--; $char_added = mb_substr($segment, $cur_pos, 1); $is_space = trim($char_added) == ""; @@ -1220,7 +1222,7 @@ class PhraseParser $one_word = self::oneWord($word_guess, $locale, $additional_regexes); if ($one_word) { - $out_segment .= " ".strrev($word_guess); + $out_segment .= " " . strrev($word_guess); $word_guess = ""; } else { $out_segment .= " ". strrev(mb_substr($word_guess, 1)); @@ -1244,11 +1246,11 @@ class PhraseParser } if (!$is_suffix) { if (mb_strlen($word_guess) > 1 && - !self::oneWord($word_guess, $locale, $additional_regexes)){ + !self::oneWord($word_guess, $locale, $additional_regexes)) { $out_segment .= " ".strrev(mb_substr($word_guess, 1)); $word_guess = $char_added; } else { - $out_segment .= " ".strrev($word_guess); + $out_segment .= " " . strrev($word_guess); $word_guess = ""; } $was_space = false; diff --git a/src/library/VersionManager.php b/src/library/VersionManager.php index 57d4abd1e..7e1f9d435 100644 --- a/src/library/VersionManager.php +++ b/src/library/VersionManager.php @@ -147,7 +147,7 @@ class VersionManager * folder will be made. If $file_changed is a nonexistent file in $folder * then the dir's in path to $file_changed will be updated. * - * @param string $file_changed + * @param string $file_changed * @param string $folder * @param int $now * @param bool $lock whether or not a lock should be obtained before @@ -965,7 +965,9 @@ class VersionManager if (is_file($file_or_dir) || is_link($file_or_dir)) { unlink($file_or_dir); } else { - rmdir($file_or_dir); + if (count(scandir($file_or_dir)) == 2) { + rmdir($file_or_dir); + } } } return self::SUCCESS; diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index f4aefc915..96099cdc3 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -324,7 +324,6 @@ class WebArchive if ($is_string) { $storage_len = strlen($this->storage); } - set_error_handler(null); if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string && $offset < $storage_len)) { for ($i = 0; $i < $num; $i++) { @@ -353,7 +352,6 @@ class WebArchive $this->iterator_pos = $offset; } } - set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); if ($open_flag) { $this->close($fh); } diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php index 76dbf79f0..d2fb724ec 100755 --- a/src/library/WebQueueBundle.php +++ b/src/library/WebQueueBundle.php @@ -594,14 +594,19 @@ class WebQueueBundle implements Notifier } $robot_paths = (isset($robot_object[0][1])) ? $robot_object[0][1] : []; //these should have been urldecoded in RobotProcessor - $robots_okay = true; - $robots_not_okay = false; - if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) { + if (empty($robot_paths)) { + $robots_okay = false; + $robots_not_okay = true; + } else { + $robots_okay = true; + $robots_not_okay = false; + } + if (!empty($robot_paths[CrawlConstants::DISALLOWED_SITES])) { $robots_not_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::DISALLOWED_SITES]); $robots_okay = !$robots_not_okay; } - if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) { + if (!empty($robot_paths[CrawlConstants::ALLOWED_SITES])) { $robots_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::ALLOWED_SITES]); } diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php index d5f6f947a..5e74af1c7 100644 --- a/src/library/processors/RobotProcessor.php +++ b/src/library/processors/RobotProcessor.php @@ -81,7 +81,8 @@ class RobotProcessor extends PageProcessor $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = ""; $summary[self::LANG] = null; - $summary[self::ROBOT_PATHS] = []; + $summary[self::ROBOT_PATHS] = [self::ALLOWED_SITES => [], + self::DISALLOWED_SITES => []]; $summary[self::AGENT_LIST] = []; $summary[self::LINKS] = []; $host_url = UrlParser::getHost($url); @@ -112,12 +113,14 @@ class RobotProcessor extends PageProcessor $summary[self::AGENT_LIST][] = $value; $current_specificness = (strcmp($value, C\USER_AGENT_SHORT) == 0) ? 1 : 0; - if ($current_specificness < $specificness) break; + if ($current_specificness < $specificness) { break; } if ($specificness < $current_specificness) { //Give precedence to exact match on agent string $specificness = $current_specificness; $add_rule_state = true; - $summary[self::ROBOT_PATHS] = []; + $summary[self::ROBOT_PATHS] = + [self::ALLOWED_SITES => [], + self::DISALLOWED_SITES => []]; break; } $agent_parts = explode("*", $value); diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php index cada8181f..4f9e520f6 100644 --- a/tests/PhraseParserTest.php +++ b/tests/PhraseParserTest.php @@ -236,10 +236,10 @@ EOD; public function computeCharGramTestCase() { $n_grams = PhraseParser::getNGramsTerm(["orange"], 6); - $this->assertTrue(count(($n_grams) == 1), "NGram Test 1"); + $this->assertTrue((count($n_grams) == 1), "NGram Test 1"); $three_grams = ['ora', 'ran', 'ang', 'nge']; $n_grams = PhraseParser::getNGramsTerm(["orange"], 3); - $this->assertTrue(count(($three_grams) == 4), "NGram Test 2"); + $this->assertTrue((count($three_grams) == 4), "NGram Test 2"); $this->assertTrue(array_diff($three_grams, $n_grams) == [], "NGram Test 3"); $n_grams = PhraseParser::getCharGramsTerm(["orange"], 'en-US'); @@ -253,12 +253,12 @@ EOD; { $segments = PhraseParser::segmentSegment("你们好吗", 'zh-CN'); $correct_segments = ["你们", "好", "吗"]; - $this->assertTrue(count(($segments) == 3), "Segmenter Test 1"); + $this->assertTrue((count($segments) == 3), "Segmenter Test 1"); $this->assertTrue(array_diff($segments, $correct_segments) == [], "Segmenter Test 2"); $segments = PhraseParser::segmentSegment("你们好吗?", 'zh-CN'); $correct_segments = ["你们", "好", "吗", "?"]; - $this->assertTrue(count(($segments) == 3), "Segmenter Test 3"); + $this->assertTrue((count($segments) == 4), "Segmenter Test 3"); $this->assertTrue(array_diff($segments, $correct_segments) == [], "Segmenter Test 4"); }