Fixes so PhraseParser unit tests pass and so that archive crawls don't crash queue server, a=chris

Chris Pollett [2017-01-11 23:Jan:th]
Fixes so PhraseParser unit tests pass and so that archive crawls don't crash queue server, a=chris
Filename
src/controllers/SearchController.php
src/executables/QueueServer.php
src/library/PhraseParser.php
tests/PhraseParserTest.php
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index bc860005f..43ee95d38 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1394,11 +1394,12 @@ class SearchController extends Controller implements CrawlConstants
         if (count($instance_parts) > 1) {
             $instance_num = intval($instance_parts[0]);
         }
-        $offset = $crawl_item[self::OFFSET];
-        $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
-        $cache_item = $crawl_model->getCacheFile($machine,
-            $machine_uri, $cache_partition, $offset,  $crawl_time,
-            $instance_num);
+        if (!empty($crawl_item[self::OFFSET])) {
+            $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
+            $cache_item = $crawl_model->getCacheFile($machine,
+                $machine_uri, $cache_partition, $crawl_item[self::OFFSET],
+                $crawl_time, $instance_num);
+        }
         if (!isset($cache_item[self::PAGE])) {
             $data["URL"] = $url;
             $data["SUMMARY_STRING"] =
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 2d5b93465..85b0c66d3 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -418,14 +418,10 @@ class QueueServer implements CrawlConstants, Join
                     $this->web_queue->to_crawl_queue->count);
             }
             $start_loop_time = time();
-
             //check and update if necessary the crawl params of current crawl
             $this->checkUpdateCrawlParameters();
-
             $this->updateMostRecentFetcher();
-
             $this->processCrawlData();
-
             $time_diff = time() - $start_loop_time;
             if ($time_diff < C\QUEUE_SLEEP_TIME) {
                 L\crawlLog("Sleeping...");
@@ -1335,7 +1331,8 @@ class QueueServer implements CrawlConstants, Join
         if ($update_disallow == true) {
             $this->updateDisallowedQuotaSites();
         }
-        if ($this->isAScheduler() && $cull_now_non_crawlable) {
+        if ($this->isAScheduler() && $cull_now_non_crawlable
+            && $this->crawl_type != self::ARCHIVE_CRAWL) {
             L\crawlLog("Scheduler: Allowed/Disallowed Urls have changed");
             L\crawlLog("Scheduler: Checking if urls in queue need" .
                 " to be culled");
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 9bc901c8b..8eb4f8c72 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -261,8 +261,8 @@ class PhraseParser
                 $triplets_list = $tokenizer->extractTripletsPhrases(
                     $phrase_and_sentences["SENTENCES"], $lang);
                 $phrase_and_sentences["TERMS_AND_PHRASES"] =
-                    array_merge($phrase_and_sentences["TERMS_AND_PHRASES"],
-                    $triplets_list['QUESTION_LIST']);
+                    $phrase_and_sentences["TERMS_AND_PHRASES"] +
+                   $triplets_list['QUESTION_LIST'];
                 $phrase_list['QUESTION_ANSWER_LIST'] =
                     $triplets_list['QUESTION_ANSWER_LIST'];
                 $phrase_list['TIMES']['QUESTION_ANSWER_EXTRACT'] =
diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php
index f40683adb..76d927311 100644
--- a/tests/PhraseParserTest.php
+++ b/tests/PhraseParserTest.php
@@ -63,8 +63,9 @@ class PhraseParserTest extends UnitTest
         $phrase_string = <<< EOD
 Dr. T.Y Lin's home page. J. R. R. Tolkien
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $words = array_keys($word_lists);
         $this->assertTrue(in_array("dr", $words), "Abbreviation 1");
         $this->assertTrue(in_array("_ty", $words), "Initials 1");
@@ -76,10 +77,10 @@ THE THE
 ©2012
 reddit: the front page of the internet
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $words = array_keys($word_lists);
-
         $this->assertTrue(in_array("the the", $words), "Extract Bigram 1");
         $this->assertTrue(in_array("deep space", $words), "Extract Bigram 2");
         $this->assertTrue(in_array("deep", $words), "Unigrams still present 1");
@@ -92,8 +93,9 @@ EOD;
 拼音 关闭 空间 百科 hao123 | 更多>>
 About Baidu
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "zh-CN");
+        $word_lists = $extracted_data['WORD_LIST'];
         $words = array_keys($word_lists);
         $this->assertTrue(in_array("百科", $words), "Chinese test 1");
         $this->assertTrue(in_array("baidu", $words), "Chinese test 2");
@@ -115,8 +117,9 @@ http://yo.lo.edu/faculty_pages/zebra/
 A&W a&TT chris@pollett.org
 Fish 'n chips
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $words = array_keys($word_lists);
         $this->assertTrue(in_array("_po", $words), "Acronym Test 1");
         $this->assertTrue(in_array("_uk", $words), "Acronym Test 2");
@@ -156,8 +159,9 @@ small table the the the the the the the the the the the their there there
 this those three to to to trap uncle uncle wagon walls was was was was was
 were where which which whirlwinds who who wife with
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $len = strlen($phrase_string);
         $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
         $this->assertTrue(($score < 0.012), "Easy Safe Test 1");
@@ -168,8 +172,9 @@ for from grown has how in is isnt knot lolita matts monster pussies ready
 she she shew slut teens their thom them thought they're tight to to to total
 up use whether
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $len = strlen($phrase_string);
         $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
         $this->assertTrue(($score > 0.012), "Easy Unsafe Test 1");
@@ -188,6 +193,7 @@ you
 EOD;
         $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $len = strlen($phrase_string);
         $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
         $this->assertTrue(($score > 0.012), "Harder Unsafe Test 1");
@@ -203,8 +209,9 @@ over parents process reproduce reproduce result sex sex sexual
 sexual small specialist specialized specific such that that the the the
 the their to to traits traits transport two types variety while young
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $len = strlen($phrase_string);
         $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
         $this->assertTrue(($score < 0.012), "Harder Safe Test 1");
@@ -216,8 +223,9 @@ lesbian may moist verb object of of or or or others secondary refer relay
 romantic same sex sexual trim the the the them to to to to to used
 used who who wide women ward
 EOD;
-        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+        $extracted_data = PhraseParser::extractPhrasesInLists($phrase_string,
             "en-US");
+        $word_lists = $extracted_data['WORD_LIST'];
         $len = strlen($phrase_string);
         $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
         $this->assertTrue(($score < 0.012), "Harder Safe Test 2");
ViewGit