Tweaks to guessLocaleFromString to handle nulls returned by stopwordsRemover, a=chris

Chris Pollett [2022-08-02 23h]
Tweaks to guessLocaleFromString to handle nulls returned by stopwordsRemover, a=chris
Filename
src/controllers/components/SystemComponent.php
src/library/LocaleFunctions.php
src/library/index_bundle_iterators/WordIterator.php
diff --git a/src/controllers/components/SystemComponent.php b/src/controllers/components/SystemComponent.php
index bd6246296..26935e562 100755
--- a/src/controllers/components/SystemComponent.php
+++ b/src/controllers/components/SystemComponent.php
@@ -1345,6 +1345,8 @@ EOD;
                 "check"=>"mb_internal_encoding", "type"=>"function"],
             [   "name" => "PDO SQLite3 Library",
                 "check"=>"\PDO", "type"=>"class"],
+            [   "name" => "PHP intl",
+                "check"=>"datefmt_create", "type"=>"function"],
             [   "name" =>
                     "Process Creation Functions (popen, pclose, and exec".
                     " needed for crawling)",
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index de9df21e3..cf60ff45b 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -47,9 +47,9 @@ require_once __DIR__."/../configs/Config.php";
  */
 function localesWithStopwordsList()
 {
-    return ['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi',
-        'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th',
-        'vi-VN', 'zh-CN'];
+    return ['ar', 'bn', 'de', 'el-GR', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi',
+        'id', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', 'tl',
+        'tr', 'vi-VN', 'zh-CN'];
 }
 /**
  * Converts a $locale_tag (major-minor) to an Iso 632-2 language name
@@ -148,8 +148,16 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
         foreach (localesWithStopwordsList() as $lang) {
             $tokenizer = PhraseParser::getTokenizer($lang);
             if ($tokenizer) {
-                $test_len =
-                    strlen($tokenizer->stopwordsRemover($guess_string) ?? "");
+                $guess_string = mb_convert_encoding($guess_string, "UTF-8");
+                $compressed_string =
+                    $tokenizer->stopwordsRemover($guess_string);
+                /* if regex fails might get null. Usually caused by bad encoding
+                   of $guess_string
+                 */
+                if (!is_string($compressed_string)) {
+                    break;
+                }
+                $test_len = strlen($compressed_string);
                 if ($test_len < $len) {
                     $len = $test_len;
                     $locale_tag = $lang;
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index c48145745..85b5d1265 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -938,7 +938,11 @@ class WordIterator extends IndexBundleIterator
                 $this->archive_file, $generation_info['POSTINGS'],
                 $generation_info['LAST_BLOB_LEN']);
         }
-        list($postings,) = $index->unpackPostings($postings_entry);
+        if (empty($postings_entry)) {
+            $postings = [];
+        } else {
+            list($postings,) = $index->unpackPostings($postings_entry);
+        }
         $this->dictionary_info[$generation]['POSTINGS'] = $postings;
         unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']);
         return $postings;
ViewGit