diff --git a/src/configs/Config.php b/src/configs/Config.php index f3c8fa999..2097fc733 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -872,7 +872,9 @@ nsconddefine('ENABLE_QUESTION_ANSWERING', true); query over those terms */ nsconddefine("SUFFIX_PHRASES", false); -/** Number of words until to switch from bag of words to phrase lookup */ +/** Word-count threshold for switching from bag of words to phrase lookup + * if SUFFIX_PHRASES is true + */ nsconddefine('PHRASE_THRESHOLD', 3); /** default number of search results to display per page */ nsconddefine('NUM_RESULTS_PER_PAGE', 10); diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index 76876724a..1ef5a5783 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -292,7 +292,7 @@ class FetchController extends Controller implements CrawlConstants self::index_closed_name . $crawl_time . ".txt"; if ($crawl_time > 0 && file_exists($index_schedule_file) && $check_crawl_time > intval(fileatime($index_schedule_file)) && - !file_exists(C\CRAWL_DIR. + !file_exists(C\CRAWL_DIR . 
"/schedules/QueueServerMessages.txt") ) { $restart = true; if (file_exists($this->crawl_status_file_name)) { diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index dab0de7ca..cc6c56274 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -535,7 +535,7 @@ class SearchController extends Controller implements CrawlConstants } } } - if (isset($_REQUEST['save_timestamp'])){ + if (isset($_REQUEST['save_timestamp'])) { $save_timestamp = substr($this->clean( $_REQUEST['save_timestamp'], 'int'), 0, C\TIMESTAMP_LEN); } else { diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 5e12b6271..d9adcaa2c 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -581,8 +581,8 @@ class Fetcher implements CrawlConstants static $last_record_time = 0; L\crawlLog("In Fetch Loop"); L\crawlLog("PHP Version in use: " . phpversion()); - $prefix = $this->fetcher_num."-"; - if (!file_exists(C\CRAWL_DIR."/{$prefix}temp")) { + $prefix = $this->fetcher_num . "-"; + if (!file_exists(C\CRAWL_DIR . "/{$prefix}temp")) { mkdir(C\CRAWL_DIR . "/{$prefix}temp"); } $info[self::STATUS] = self::CONTINUE_STATE; @@ -765,8 +765,8 @@ class Fetcher implements CrawlConstants - L\changeInMicrotime($start_time)))); return []; } - $prefix = $this->fetcher_num."-"; - $tmp_dir = C\CRAWL_DIR."/{$prefix}temp"; + $prefix = $this->fetcher_num . "-"; + $tmp_dir = C\CRAWL_DIR . "/{$prefix}temp"; $filtered_sites = []; $site_pages = []; foreach ($sites as $site) { @@ -867,9 +867,9 @@ class Fetcher implements CrawlConstants */ public function downloadPagesArchiveCrawl() { - $prefix = $this->fetcher_num."-"; + $prefix = $this->fetcher_num . "-"; $arc_name = "$prefix" . self::archive_base_name . $this->crawl_index; - $base_name = C\CRAWL_DIR."/cache/$arc_name"; + $base_name = C\CRAWL_DIR . 
"/cache/$arc_name"; $pages = []; if (!isset($this->archive_iterator->iterate_timestamp) || $this->archive_iterator->iterate_timestamp != $this->crawl_index || @@ -1106,7 +1106,7 @@ class Fetcher implements CrawlConstants if (L\generalIsA(C\NS_ARCHIVE . $this->arc_type . "Iterator", C\NS_ARCHIVE . "TextArchiveBundleIterator")) { $result_dir = C\WORK_DIRECTORY . "/schedules/" . - $prefix.self::fetch_archive_iterator . $this->crawl_time; + $prefix . self::fetch_archive_iterator . $this->crawl_time; $iterator_name = C\NS_ARCHIVE . $this->arc_type . "Iterator"; $this->archive_iterator = new $iterator_name( $info[self::CRAWL_INDEX], @@ -1252,14 +1252,15 @@ class Fetcher implements CrawlConstants $name_server = $this->name_server; $time = time(); $session = md5($time . C\AUTH_KEY); - $prefix = $this->fetcher_num."-"; + $prefix = $this->fetcher_num . "-" . $this->channel . "-"; $request = $name_server."?c=fetch&a=archiveSchedule&time=$time". - "&session=$session&robot_instance=".$prefix.C\ROBOT_INSTANCE. + "&session=$session&robot_instance=" . $prefix . C\ROBOT_INSTANCE. "&machine_uri=".C\WEB_URI."&crawl_time=".$this->crawl_time. - "&check_crawl_time=".$this->check_crawl_time; + "&check_crawl_time=" . 
$this->check_crawl_time; L\crawlLog($request); $response_string = FetchUrl::getPage($request, null, true); + echo $response_string; if ($response_string === false) { L\crawlLog("Request failed!"); return false; diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index ec294565c..b85aee8fc 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -101,16 +101,18 @@ function guessLocale() function guessLocaleFromString($phrase_string, $locale_tag = null) { $len = strlen($phrase_string); - foreach (['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi', - 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', - 'vi-VN', 'zh-CN'] as $lang) { - $tokenizer = PhraseParser::getTokenizer($lang); - if ($tokenizer) { - $test_len = - strlen($tokenizer->stopwordsRemover($phrase_string)); - if ($test_len < $len) { - $len = $test_len; - $locale_tag = $lang; + if (!$locale_tag || $len >= C\NAME_LEN) { + foreach (['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi', + 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', + 'vi-VN', 'zh-CN'] as $lang) { + $tokenizer = PhraseParser::getTokenizer($lang); + if ($tokenizer) { + $test_len = + strlen($tokenizer->stopwordsRemover($phrase_string)); + if ($test_len < $len) { + $len = $test_len; + $locale_tag = $lang; + } } } } diff --git a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php index 0d694dabb..a2d2d1135 100644 --- a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php +++ b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php @@ -206,7 +206,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator } $this->num_partitions = count($this->partitions); $this->status_filename = "{$this->result_dir}/iterate_status.txt"; - $this->buffer_filename = $this->result_dir."/buffer.txt"; + $this->buffer_filename = 
$this->result_dir . "/buffer.txt"; if (file_exists($this->status_filename)) { $this->restoreCheckpoint(); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index c782e9809..58dbcbc61 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -538,7 +538,8 @@ class PhraseModel extends ParallelModel $query_string = $query; $this->program_indicator = false; } - $locale_tag = L\guessLocaleFromString($query_string); + $locale_tag = L\guessLocale(); + $locale_tag = L\guessLocaleFromString($query_string, $locale_tag); $quote_state = false; $phrase_parts = explode('"', $phrase_string); $base_words = []; @@ -593,7 +594,7 @@ class PhraseModel extends ParallelModel $this->query_info['QUERY'] .= "$in3<i>Index</i>: ". $index_name."<br />"; $this->query_info['QUERY'] .= "$in3<i>LocaleTag</i>: ". - $locale_tag."<br />"; + $locale_tag ."<br />"; $this->query_info['QUERY'] .= "$in3<i>Stemmed/Char-grammed Words</i>:<br />"; foreach ($base_words as $word) {