Minor optimizations to extractPhrasesInLists, a=chris

Chris Pollett [2022-08-16]
Minor optimizations to extractPhrasesInLists, a=chris
Filename
src/executables/ArcTool.php
src/library/PhraseParser.php
src/views/layouts/WebLayout.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 3699ef265..966f9c685 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -210,8 +210,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
      */
      public function outputArchiveList()
      {
-        $yioop_pattern = C\CRAWL_DIR."/cache/{*-" . self::archive_base_name .
-            "," . self::archive_base_name . "," .
+        $yioop_pattern = C\CRAWL_DIR."/cache/{" .
             self::double_index_base_name . "," . self::index_data_base_name .
             "}*";
         $archives = glob($yioop_pattern, GLOB_BRACE);
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 93a53ba9e..88c7fde9b 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -132,7 +132,6 @@ class PhraseParser
         $index_name = null, $exact_match = false, $threshold =
         C\MIN_RESULTS_TO_GROUP)
     {
-
         $index_name = (empty($index_name) || is_integer($index_name)
             || $index_name[0] != '-') ?
             $index_name : substr($index_name, 1);
@@ -231,11 +230,13 @@ class PhraseParser
                 method_exists($tokenizer, "extractTripletsPhrases")) {
                 $triplets_list = $tokenizer->extractTripletsPhrases(
                     $phrase_and_sentences["SENTENCES"], $lang);
-                $phrase_and_sentences["TERM_POSITIONS"] =
-                    $phrase_and_sentences["TERM_POSITIONS"] +
-                   $triplets_list['QUESTION_LIST'];
-                $phrase_list['QUESTION_ANSWER_LIST'] =
-                    $triplets_list['QUESTION_ANSWER_LIST'];
+                if (!empty($triplets_list['QUESTION_LIST'])) {
+                    $phrase_and_sentences["TERM_POSITIONS"] =
+                        $phrase_and_sentences["TERM_POSITIONS"] +
+                       $triplets_list['QUESTION_LIST'];
+                    $phrase_list['QUESTION_ANSWER_LIST'] =
+                        $triplets_list['QUESTION_ANSWER_LIST'];
+                }
                 $phrase_list['TIMES']['QUESTION_ANSWER_EXTRACT'] =
                     changeInMicrotime($qa_start_time);
             }
@@ -264,22 +265,24 @@ class PhraseParser
         $t = 1; /*first position in doc is 1 as will encode with modified9
              which requires positive numbers
         */
-        // add all single terms
-        foreach ($terms as $term) {
-            if (!isset($pos_lists[$term])) {
-                $pos_lists[$term] = [];
+        if (strpos($string ?? "", "-") === false) {
+            foreach ($terms as $term) {
+                $pos_lists[$term][] = $t++;
             }
-            $pos_lists[$term][] = $t;
-            // this is to allow for searching by entities and parts of entities
-            if (count($term_parts) > 1) {
+        } else {
+            // add all single terms in entity
+            foreach ($terms as $term) {
+                $pos_lists[$term][] = $t;
+                /* this is to allow for searching by entities and parts
+                   of entities
+                 */
+                $term_parts = explode("-", $term);
+                array_shift($term_parts);
                 foreach($term_parts as $part) {
-                    if (!isset($pos_lists[$part])) {
-                        $pos_lists[$part] = [];
-                    }
                     $pos_lists[$part][] = $t;
                 }
+                $t++;
             }
-            $t++;
         }
         return $pos_lists;
     }
@@ -424,44 +427,41 @@ class PhraseParser
         $t = 1; /*first position in doc is 1 as will encode with modified9
              which requires positive numbers
         */
-        // add all single terms
-        foreach ($terms as $term) {
-            if (!isset($pos_lists[$term])) {
-                $pos_lists[$term] = [];
+        if (strpos($string ?? "", "-") === false) {
+            foreach ($terms as $term) {
+                $pos_lists[$term][] = $t++;
             }
-            $pos_lists[$term][] = $t;
-            $term_parts = explode("-", $term ?? "");
-            // this is to allow for searching by entities and parts of entities
-            if (count($term_parts) > 1) {
+        } else {
+            // add all single terms in entity
+            foreach ($terms as $term) {
+                $pos_lists[$term][] = $t;
+                /* this is to allow for searching by entities and parts
+                   of entities
+                 */
+
+                $term_parts = explode("-", $term ?? "");
+                array_shift($term_parts);
                 foreach($term_parts as $part) {
-                    if (!isset($pos_lists[$part])) {
-                        $pos_lists[$part] = [];
-                    }
                     $pos_lists[$part][] = $t;
                 }
+                $t++;
             }
-            $t++;
         }
         $out["TERM_POSITIONS"] = $pos_lists;
         $tokenizer = self::getTokenizer($lang);
         if ($extract_sentences && !empty($tokenizer) &&
             method_exists($tokenizer, "tagTokenizePartOfSpeech") &&
             !isset(self::$programming_language_map[$lang])) {
-            $string = mb_strtolower($string);
-            $sentences = preg_split("/(\n\n+)|\.|\!|\?|。|！|？/u", $string);
-            $pos = 1;
+            $string = 'Zdummy. ' . mb_strtolower($string);
+            $sentences = preg_split(
+                '/\s*((\n\n+)|\.[\D]|\.$|\!|\?|。|！|？)\s*/u', $string, -1,
+                PREG_SPLIT_OFFSET_CAPTURE);
             $sentences_pos = [];
-            foreach ($sentences as $sentence) {
-                $sentence = trim($sentence);
-                if (!empty($sentence)) {
-                    if (empty($sentences_pos[$sentence])) {
-                        $sentences_pos[$sentence] = [$pos];
-                    } else {
-                        $sentences_pos[$sentence][] = $pos;
-                    }
-                    $pos += str_word_count($sentence);
-                }
+            foreach ($sentences as $sentence_data) {
+                list($sentence, $pos) = $sentence_data;
+                $sentences_pos[$sentence][] = $pos;
             }
+            unset($sentences_pos[""], $sentences_pos["Zdummy"]);
             $out["SENTENCES"] = $sentences_pos;
         }
         return $out;
@@ -483,8 +483,11 @@ class PhraseParser
         $to_string = false)
     {
         static $non_hyphens = "";
+        static $segment_char_gram_lang = [];
         if (empty($non_hyphens)) {
             $non_hyphens = str_replace("-|", "", C\PUNCT);
+            $segment_char_gram_lang = ['zh-CN', "ko", "ja", 'bn', 'he', 'hi',
+                'id', 'kn', 'pl', 'te', 'th', 'tl'];
         }
         if (isset(self::$programming_language_map[$lang])) {
             mb_internal_encoding("UTF-8");
@@ -498,8 +501,12 @@ class PhraseParser
                 $string = preg_replace('/(,:)\p{P}/u', "", $string);
             }
             $string = mb_ereg_replace("\s+|$non_hyphens", " ", $string);
-            $terms = self::segmentSegment($string, $lang);
-            $terms = self::charGramTerms($terms, $lang);
+            $tokenizer = self::getTokenizer($lang);
+            $terms = $string;
+            if (in_array($lang, $segment_char_gram_lang)) {
+                $terms = self::segmentSegment($terms, $lang);
+                $terms = self::charGramTerms($terms, $lang);
+            }
             $terms = self::stemTerms($terms, $lang);
         }
         if ($to_string) {
@@ -902,28 +909,33 @@ class PhraseParser
      */
     public static function stemTermsK($string_or_array, $lang, $keep_empties)
     {
-        if ($string_or_array == [] ||
-            $string_or_array == "") { return [];}
-        if (is_array($string_or_array)) {
-            $terms = $string_or_array;
-        } else {
-            $terms = mb_split("[[:space:]]", $string_or_array);
+        if (empty($string_or_array)) {
+            return [];
         }
-        $stem_obj = self::getTokenizer($lang);
-        $stems = [];
-        if (!empty($stem_obj) && method_exists($stem_obj, "stem")) {
-            foreach ($terms as $term) {
-                if (trim($term) == "") {
-                    if (!$keep_empties) {
-                        continue;
+        if (is_array($string_or_array)) {
+            $terms = [];
+            if ($keep_empties) {
+                $terms = $string_or_array;
+            } else {
+                foreach ($string_or_array as $pre_term) {
+                    $term = trim($pre_term);
+                    if (!empty($term)) {
+                        $terms[] = $term;
                     }
                 }
-                $stems[] = (strpos($term, "_") === false) ?
-                    $stem_obj->stem($term) : $term;
             }
         } else {
+            $terms = mb_split("[[:space:]]", $string_or_array);
+        }
+        $stem_obj = self::getTokenizer($lang);
+        if (empty($stem_obj) || !method_exists($stem_obj, "stem")) {
             return $terms;
         }
+        $stems = [];
+        foreach ($terms as $term) {
+            $stems[] = (strpos($term, "_") === false) ?
+                $stem_obj->stem($term) : $term;
+        }
         return $stems;
     }
     /**
diff --git a/src/views/layouts/WebLayout.php b/src/views/layouts/WebLayout.php
index cff0b20b2..0d344c1a9 100755
--- a/src/views/layouts/WebLayout.php
+++ b/src/views/layouts/WebLayout.php
@@ -427,7 +427,7 @@ class WebLayout extends Layout
         }
         if (isset($data['DISPLAY_MESSAGE'])) {
             /*using double quotes in case message string has single quotes*/
-+            e("\ndoMessage(\"<h1 class='display-message' >" .
+            e("\ndoMessage(\"<h1 class='display-message' >" .
                 $data['DISPLAY_MESSAGE'] .
                 "</h1>\");");
         } ?>;/*keep semi-colon just in case inserted JS didn't have */
ViewGit