diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index bc860005f..43ee95d38 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1394,11 +1394,12 @@ class SearchController extends Controller implements CrawlConstants
if (count($instance_parts) > 1) {
$instance_num = intval($instance_parts[0]);
}
- $offset = $crawl_item[self::OFFSET];
- $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
- $cache_item = $crawl_model->getCacheFile($machine,
- $machine_uri, $cache_partition, $offset, $crawl_time,
- $instance_num);
+ if (!empty($crawl_item[self::OFFSET])) {
+ $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
+ $cache_item = $crawl_model->getCacheFile($machine,
+ $machine_uri, $cache_partition, $crawl_item[self::OFFSET],
+ $crawl_time, $instance_num);
+ }
if (!isset($cache_item[self::PAGE])) {
$data["URL"] = $url;
$data["SUMMARY_STRING"] =
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 2d5b93465..85b0c66d3 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -418,14 +418,10 @@ class QueueServer implements CrawlConstants, Join
$this->web_queue->to_crawl_queue->count);
}
$start_loop_time = time();
-
//check and update if necessary the crawl params of current crawl
$this->checkUpdateCrawlParameters();
-
$this->updateMostRecentFetcher();
-
$this->processCrawlData();
-
$time_diff = time() - $start_loop_time;
if ($time_diff < C\QUEUE_SLEEP_TIME) {
L\crawlLog("Sleeping...");
@@ -1335,7 +1331,8 @@ class QueueServer implements CrawlConstants, Join
if ($update_disallow == true) {
$this->updateDisallowedQuotaSites();
}
- if ($this->isAScheduler() && $cull_now_non_crawlable) {
+ if ($this->isAScheduler() && $cull_now_non_crawlable
+ && $this->crawl_type != self::ARCHIVE_CRAWL) {
L\crawlLog("Scheduler: Allowed/Disallowed Urls have changed");
L\crawlLog("Scheduler: Checking if urls in queue need" .
" to be culled");
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 9bc901c8b..8eb4f8c72 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -261,8 +261,8 @@ class PhraseParser
$triplets_list = $tokenizer->extractTripletsPhrases(
$phrase_and_sentences["SENTENCES"], $lang);
$phrase_and_sentences["TERMS_AND_PHRASES"] =
- array_merge($phrase_and_sentences["TERMS_AND_PHRASES"],
- $triplets_list['QUESTION_LIST']);
+ $phrase_and_sentences["TERMS_AND_PHRASES"] +
+ $triplets_list['QUESTION_LIST'];
$phrase_list['QUESTION_ANSWER_LIST'] =
$triplets_list['QUESTION_ANSWER_LIST'];
$phrase_list['TIMES']['QUESTION_ANSWER_EXTRACT'] =
diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php
index f40683adb..76d927311 100644
--- a/tests/PhraseParserTest.php
+++ b/tests/PhraseParserTest.php
@@ -63,8 +63,9 @@ class PhraseParserTest extends UnitTest
$phrase_string = <<< EOD
Dr. T.Y Lin's home page. J. R. R. Tolkien
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$words = array_keys($word_lists);
$this->assertTrue(in_array("dr", $words), "Abbreviation 1");
$this->assertTrue(in_array("_ty", $words), "Initials 1");
@@ -76,10 +77,10 @@ THE THE
©2012
reddit: the front page of the internet
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$words = array_keys($word_lists);
-
$this->assertTrue(in_array("the the", $words), "Extract Bigram 1");
$this->assertTrue(in_array("deep space", $words), "Extract Bigram 2");
$this->assertTrue(in_array("deep", $words), "Unigrams still present 1");
@@ -92,8 +93,9 @@ EOD;
拼音 关闭 空间 百科 hao123 | 更多>>
About Baidu
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"zh-CN");
+ $word_lists = $extracted_data['WORD_LIST'];
$words = array_keys($word_lists);
$this->assertTrue(in_array("百科", $words), "Chinese test 1");
$this->assertTrue(in_array("baidu", $words), "Chinese test 2");
@@ -115,8 +117,9 @@ http://yo.lo.edu/faculty_pages/zebra/
A&W a&TT chris@pollett.org
Fish 'n chips
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$words = array_keys($word_lists);
$this->assertTrue(in_array("_po", $words), "Acronym Test 1");
$this->assertTrue(in_array("_uk", $words), "Acronym Test 2");
@@ -156,8 +159,9 @@ small table the the the the the the the the the the the their there there
this those three to to to trap uncle uncle wagon walls was was was was was
were where which which whirlwinds who who wife with
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$len = strlen($phrase_string);
$score = PhraseParser::computeSafeSearchScore($word_lists, $len);
$this->assertTrue(($score < 0.012), "Easy Safe Test 1");
@@ -168,8 +172,9 @@ for from grown has how in is isnt knot lolita matts monster pussies ready
she she shew slut teens their thom them thought they're tight to to to total
up use whether
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$len = strlen($phrase_string);
$score = PhraseParser::computeSafeSearchScore($word_lists, $len);
$this->assertTrue(($score > 0.012), "Easy Unsafe Test 1");
@@ -188,6 +193,7 @@ you
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$len = strlen($phrase_string);
$score = PhraseParser::computeSafeSearchScore($word_lists, $len);
$this->assertTrue(($score > 0.012), "Harder Unsafe Test 1");
@@ -203,8 +209,9 @@ over parents process reproduce reproduce result sex sex sexual
sexual small specialist specialized specific such that that the the the
the their to to traits traits transport two types variety while young
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$len = strlen($phrase_string);
$score = PhraseParser::computeSafeSearchScore($word_lists, $len);
$this->assertTrue(($score < 0.012), "Harder Safe Test 1");
@@ -216,8 +223,9 @@ lesbian may moist verb object of of or or or others secondary refer relay
romantic same sex sexual trim the the the them to to to to to used
used who who wide women ward
EOD;
- $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+ $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
"en-US");
+ $word_lists = $extracted_data['WORD_LIST'];
$len = strlen($phrase_string);
$score = PhraseParser::computeSafeSearchScore($word_lists, $len);
$this->assertTrue(($score < 0.012), "Harder Safe Test 2");