diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 3699ef265..966f9c685 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -210,8 +210,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants */ public function outputArchiveList() { - $yioop_pattern = C\CRAWL_DIR."/cache/{*-" . self::archive_base_name . - "," . self::archive_base_name . "," . + $yioop_pattern = C\CRAWL_DIR."/cache/{" . self::double_index_base_name . "," . self::index_data_base_name . "}*"; $archives = glob($yioop_pattern, GLOB_BRACE); diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 93a53ba9e..88c7fde9b 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -132,7 +132,6 @@ class PhraseParser $index_name = null, $exact_match = false, $threshold = C\MIN_RESULTS_TO_GROUP) { - $index_name = (empty($index_name) || is_integer($index_name) || $index_name[0] != '-') ? $index_name : substr($index_name, 1); @@ -231,11 +230,13 @@ class PhraseParser method_exists($tokenizer, "extractTripletsPhrases")) { $triplets_list = $tokenizer->extractTripletsPhrases( $phrase_and_sentences["SENTENCES"], $lang); - $phrase_and_sentences["TERM_POSITIONS"] = - $phrase_and_sentences["TERM_POSITIONS"] + - $triplets_list['QUESTION_LIST']; - $phrase_list['QUESTION_ANSWER_LIST'] = - $triplets_list['QUESTION_ANSWER_LIST']; + if (!empty($triplets_list['QUESTION_LIST'])) { + $phrase_and_sentences["TERM_POSITIONS"] = + $phrase_and_sentences["TERM_POSITIONS"] + + $triplets_list['QUESTION_LIST']; + $phrase_list['QUESTION_ANSWER_LIST'] = + $triplets_list['QUESTION_ANSWER_LIST']; + } $phrase_list['TIMES']['QUESTION_ANSWER_EXTRACT'] = changeInMicrotime($qa_start_time); } @@ -264,22 +265,24 @@ class PhraseParser $t = 1; /*first position in doc is 1 as will encode with modified9 which requires positive numbers */ - // add all single terms - foreach ($terms as $term) { - if (!isset($pos_lists[$term])) { - $pos_lists[$term] = []; + if (strpos($string ?? "", "-") === false) { + foreach ($terms as $term) { + $pos_lists[$term][] = $t++; } - $pos_lists[$term][] = $t; - // this is to allow for searching by entities and parts of entities - if (count($term_parts) > 1) { + } else { + // add all single terms in entity + foreach ($terms as $term) { + $pos_lists[$term][] = $t; + /* this is to allow for searching by entities and parts + of entities + */ + $term_parts = explode("-", $term); + array_shift($term_parts); foreach($term_parts as $part) { - if (!isset($pos_lists[$part])) { - $pos_lists[$part] = []; - } $pos_lists[$part][] = $t; } + $t++; } - $t++; } return $pos_lists; } @@ -424,44 +427,41 @@ class PhraseParser $t = 1; /*first position in doc is 1 as will encode with modified9 which requires positive numbers */ - // add all single terms - foreach ($terms as $term) { - if (!isset($pos_lists[$term])) { - $pos_lists[$term] = []; + if (strpos($string ?? "", "-") === false) { + foreach ($terms as $term) { + $pos_lists[$term][] = $t++; } - $pos_lists[$term][] = $t; - $term_parts = explode("-", $term ?? ""); - // this is to allow for searching by entities and parts of entities - if (count($term_parts) > 1) { + } else { + // add all single terms in entity + foreach ($terms as $term) { + $pos_lists[$term][] = $t; + /* this is to allow for searching by entities and parts + of entities + */ + + $term_parts = explode("-", $term ?? ""); + array_shift($term_parts); foreach($term_parts as $part) { - if (!isset($pos_lists[$part])) { - $pos_lists[$part] = []; - } $pos_lists[$part][] = $t; } + $t++; } - $t++; } $out["TERM_POSITIONS"] = $pos_lists; $tokenizer = self::getTokenizer($lang); if ($extract_sentences && !empty($tokenizer) && method_exists($tokenizer, "tagTokenizePartOfSpeech") && !isset(self::$programming_language_map[$lang])) { - $string = mb_strtolower($string); - $sentences = preg_split("/(\n\n+)|\.|\!|\?|。|!|?/u", $string); - $pos = 1; + $string = 'Zdummy. ' . mb_strtolower($string); + $sentences = preg_split( + '/\s*((\n\n+)|\.[\D]|\.$|\!|\?|。|!|?)\s*/u', $string, -1, + PREG_SPLIT_OFFSET_CAPTURE); $sentences_pos = []; - foreach ($sentences as $sentence) { - $sentence = trim($sentence); - if (!empty($sentence)) { - if (empty($sentences_pos[$sentence])) { - $sentences_pos[$sentence] = [$pos]; - } else { - $sentences_pos[$sentence][] = $pos; - } - $pos += str_word_count($sentence); - } + foreach ($sentences as $sentence_data) { + list($sentence, $pos) = $sentence_data; + $sentences_pos[$sentence][] = $pos; } + unset($sentences_pos[""], $sentences_pos["Zdummy"]); $out["SENTENCES"] = $sentences_pos; } return $out; @@ -483,8 +483,11 @@ class PhraseParser $to_string = false) { static $non_hyphens = ""; + static $segment_char_gram_lang = []; if (empty($non_hyphens)) { $non_hyphens = str_replace("-|", "", C\PUNCT); + $segment_char_gram_lang = ['zh-CN', "ko", "ja", 'bn', 'he', 'hi', + 'id', 'kn', 'pl', 'te', 'th', 'tl']; } if (isset(self::$programming_language_map[$lang])) { mb_internal_encoding("UTF-8"); @@ -498,8 +501,12 @@ class PhraseParser $string = preg_replace('/(,:)\p{P}/u', "", $string); } $string = mb_ereg_replace("\s+|$non_hyphens", " ", $string); - $terms = self::segmentSegment($string, $lang); - $terms = self::charGramTerms($terms, $lang); + $tokenizer = self::getTokenizer($lang); + $terms = $string; + if (in_array($lang, $segment_char_gram_lang)) { + $terms = self::segmentSegment($terms, $lang); + $terms = self::charGramTerms($terms, $lang); + } $terms = self::stemTerms($terms, $lang); } if ($to_string) { @@ -902,28 +909,33 @@ class PhraseParser */ public static function stemTermsK($string_or_array, $lang, $keep_empties) { - if ($string_or_array == [] || - $string_or_array == "") { return [];} - if (is_array($string_or_array)) { - $terms = $string_or_array; - } else { - $terms = mb_split("[[:space:]]", $string_or_array); + if (empty($string_or_array)) { + return []; } - $stem_obj = self::getTokenizer($lang); - $stems = []; - if (!empty($stem_obj) && method_exists($stem_obj, "stem")) { - foreach ($terms as $term) { - if (trim($term) == "") { - if (!$keep_empties) { - continue; + if (is_array($string_or_array)) { + $terms = []; + if ($keep_empties) { + $terms = $string_or_array; + } else { + foreach ($string_or_array as $pre_term) { + $term = trim($pre_term); + if (!empty($term)) { + $terms[] = $term; } } - $stems[] = (strpos($term, "_") === false) ? - $stem_obj->stem($term) : $term; } } else { + $terms = mb_split("[[:space:]]", $string_or_array); + } + $stem_obj = self::getTokenizer($lang); + if (empty($stem_obj) || !method_exists($stem_obj, "stem")) { return $terms; } + $stems = []; + foreach ($terms as $term) { + $stems[] = (strpos($term, "_") === false) ? + $stem_obj->stem($term) : $term; + } return $stems; } /** diff --git a/src/views/layouts/WebLayout.php b/src/views/layouts/WebLayout.php index cff0b20b2..0d344c1a9 100755 --- a/src/views/layouts/WebLayout.php +++ b/src/views/layouts/WebLayout.php @@ -427,7 +427,7 @@ class WebLayout extends Layout } if (isset($data['DISPLAY_MESSAGE'])) { /*using double quotes in case message string has single quotes*/ -+ e("\ndoMessage(\"<h1 class='display-message' >" . + e("\ndoMessage(\"<h1 class='display-message' >" . $data['DISPLAY_MESSAGE'] . "</h1>\");"); } ?>;/*keep semi-colon just in case inserted JS didn't have */