diff --git a/src/configs/Config.php b/src/configs/Config.php index fcb4939fd..f3c8fa999 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -169,7 +169,7 @@ nsdefine('MIN_AD_VERSION', 36); * Version number for upgrading locale resource folders and for upgrading * public and help wikis */ -nsdefine('RESOURCES_WIKI_VERSION', 3); +nsdefine('RESOURCES_WIKI_VERSION', 4); /** * nsdefine's the BASE_URL constant for this script * if run from the command line as part of index.php HTTP server scrip diff --git a/src/index.php b/src/index.php index 8bbafea8e..840b91695 100644 --- a/src/index.php +++ b/src/index.php @@ -77,7 +77,7 @@ function bootstrap($web_site = null, $start_new_session = true) /** * Load global functions related to localization */ - require_once __DIR__."/library/LocaleFunctions.php"; + require_once __DIR__ . "/library/LocaleFunctions.php"; ini_set("memory_limit","500M"); if (!empty($web_site)) { if ((empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource')) { diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index 220423453..ec294565c 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -100,64 +100,14 @@ function guessLocale() */ function guessLocaleFromString($phrase_string, $locale_tag = null) { - $original_phrase_string = $phrase_string; - $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag; - $sub = C\PUNCT . "|[0-9]|\s"; - $phrase_string = preg_replace('/' . $sub . 
'/', "", $phrase_string); - $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8"); $len = strlen($phrase_string); - $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0, - 'th' => 0, 'vi' => 0, 'zh-CN' => 0]; - $guess[$locale_tag] = 1; - for ($i = 0; $i < $len; $i += 4) { - $start = ord($phrase_string[$i+2]); - $next = ord($phrase_string[$i+3]); - if ($start >= 6 && $start <= 7) { - if ($locale_tag == "fa") { - $guess[$locale_tag] +=2; - } else { - $guess['ar'] += 2; - } - } else if ($start == 5 && $next >= 144) { - $guess['he'] += 2; - } else if (($start == 9 && $next < 128) || ($start == 168 && - $next >= 224)) { - $guess['hi'] += 2; - } else if ($start == 17 || $start >= 172 && $start < 215) { - $guess['ko'] += 2; - } else if ($start >= 48 && $start <= 49) { - $guess['ja'] += 3; - } else if ($start == 4 || ($start == 5 && $next < 48)) { - $guess['ru']++; - } else if ($start == 14 && $next < 128) { - $guess['th'] += 2; - } else if (($start == 30 && in_array($next, [199, 209, 219])) || - ($start == 1 && in_array($next, [160, 161, 175, 176]))) { - $guess['vi'] += 7; - } else if ($start >= 78 && $start <= 159) { - $guess['zh-CN'] += 4; - } else if ($start == 0 && $next < 128) { - $guess[$locale_tag]++; // assume ascii is from $locale_tag - } - } - $num_points = ($len / 4) - 1; //there will be a lead and tail space - $max = $guess[$locale_tag]; - if ($num_points >= 0 ) { - foreach ($guess as $tag => $cnt) { - if ($cnt >= $num_points && $cnt > $max) { - $locale_tag = $tag; - $max = $cnt; - break; - } - } - } - if ($locale_tag == 'en-US') { - $len = strlen($original_phrase_string); - $locale_tag = 'en-US'; - foreach (['en-US', 'fr-FR', 'es', 'it'] as $lang) { - $tokenizer = PhraseParser::getTokenizer($lang); + foreach (['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi', + 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', + 'vi-VN', 'zh-CN'] as $lang) { + $tokenizer = PhraseParser::getTokenizer($lang); 
+ if ($tokenizer) { $test_len = - strlen($tokenizer->stopwordsRemover($original_phrase_string)); + strlen($tokenizer->stopwordsRemover($phrase_string)); if ($test_len < $len) { $len = $test_len; $locale_tag = $lang; diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index d2b08b6c1..29ca71c02 100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "بي إتش بي محرك البحث-يوب!: %s" rss_layout_description = "نتائج البحث ل: %s" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini index a8ecef528..d1316a74b 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/bn/resources/Tokenizer.php b/src/locale/bn/resources/Tokenizer.php index 650e1ee60..ac6bcbe1b 100755 --- a/src/locale/bn/resources/Tokenizer.php +++ b/src/locale/bn/resources/Tokenizer.php @@ -37,9 +37,44 @@ namespace seekquarry\yioop\locale\bn\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @var array + */ + public static $stop_words = ['হিসাবে', 'আমি', 'তার', 'যে', 'তিনি', 'ছিল', + 'জন্য', 'উপর', 'হয়', 'সঙ্গে', 'তারা', 'হতে', 'এ', 'এক', 'আছে', 'এই', + 'থেকে', 'দ্বারা', 'গরম', 'শব্দ', 'কিন্তু', 'কি', 'কিছু', 'হয়', 'এটা', 'আপনি', + 'বা', 'ছিল', 'দী', 'এর', 'থেকে', 'এবং', 'একটি', 'মধ্যে', 'আমরা', 'করতে', + 'পারেন', 'আউট', 'অন্যান্য', 'ছিল', 'যা', 'কি', 'তাদের', 'সময়', 'যদি', + 'অভিলাষ', 'কিভাবে', 'তিনি বলেন,', 'একটি', 'প্রতিটি', 'বলুন', 'না', 'সেট', + 'তিন', 'চান', 'বায়ু', 'ভাল', 'এছাড়াও', 'খেলা', 'ছোট', 'শেষ', 'করা', 'হোম', + 'পড়া', 'হাত', 'পোর্ট', 'বড়', 'বানান', 'যোগ করা', 'এমনকি', 'জমি', 'এখানে', + 'অবশ্যই', 'বড়', 'উচ্চ', 'এমন', 'অনুসরণ করা', 'আইন', 'কেন', 'জিজ্ঞাসা', + 'পুরুষ', 'পরিবর্তন', 'গিয়েছিলাম', 'আলো', 'ধরনের', 'বন্ধ', 'প্রয়োজন', 'ঘর', + 'ছবি', 'চেষ্টা', 'আমাদের', 'আবার', 'পশু', 'বিন্দু', 'মা', 'বিশ্বের', 'কাছাকাছি', + 'নির্মাণ', 'স্ব', 'পৃথিবী', 'বাবা']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). Words are delimited by Unicode-aware + * lookarounds, not \b, so combining marks stay attached to their word + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( 
+ 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index 18b2a829f..a5e1c36c3 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/de/resources/Tokenizer.php b/src/locale/de/resources/Tokenizer.php index 924649332..a30d2731b 100755 --- a/src/locale/de/resources/Tokenizer.php +++ b/src/locale/de/resources/Tokenizer.php @@ -56,7 +56,8 @@ class Tokenizer public static $no_stem_list =["titanic"]; /** * A list of frequently occurring terms for this locale which should - * be excluded from certain kinds of queries + * be excluded from certain kinds of queries. This is also used + * for language detection * @array */ public static $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller', @@ -150,7 +151,8 @@ class Tokenizer return $pre_segment; } /** - * Removes the stop words from the page (used for Word Cloud generation) + * Removes the stop words from the page (used for Word Cloud generation + * and language detection) * * @param mixed $data either a string or an array of string to remove * stop words from diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 283fadf7c..895b970db 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "PHP Search Engine - Yioop! 
: %s" rss_layout_description = "Search results for: %s" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "Yioop" ; ; WikiView.php diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index f8b8f46b8..d72298972 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index 66dd15532..3cf6077a2 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "موتور جستجوی PHP - Yioop! : %s" rss_layout_description = "نتایج جستجو برای: %s" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index 0854d61ab..adaaffe36 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "Moteur de recherche PHP -Yioop! 
%s" rss_layout_description = "%s Résultats" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index 2b63603a5..0f71065c5 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/he/resources/Tokenizer.php b/src/locale/he/resources/Tokenizer.php index f382fe933..e3a1a5502 100755 --- a/src/locale/he/resources/Tokenizer.php +++ b/src/locale/he/resources/Tokenizer.php @@ -37,9 +37,53 @@ namespace seekquarry\yioop\locale\he\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @array + */ + public static $stop_words = ['כמו', 'אני', 'שלו', 'ש', 'הוא', + 'היה', 'עבור', 'על', 'הם', 'עם', 'הם', + 'להיות', 'ב', 'אחד', 'יש לי', 'זה', 'מ', 'על ידי', + 'חם', 'מילה', 'אבל', 'מה', 'כמה', 'הוא', 'זה', + 'אתה', 'או', 'היה לי', 'עבור', 'של', 'אל', + 'ו', 'זמן', 'ב', 'אנחנו', 'יכול', + 'את', 'אחר', 'היו', 'ש', 'לעשות', + 'שלהם', 'זמן', 'אם', 'יהיה', 'איך', + 'אמר', 'בית', 'כל', 'לספר', 'עושה', + 'סט', 'שלוש', 'רוצה', 'אוויר', 'גם', + 'גם', 'לשחק', 'קטן', + 'סוף', 'לשים', 'בית', 'לקרוא', 'יד', 'נמל', 'גדול', + 'לאיית', 'להוסיף', 'אפילו', 'ארץ', + 'כאן', 'חייב', 'גדול', 'גבוה', + 'כזה', 'מעקב', 'מעשה', 'מדוע', + 'שואל', 'אנשים', 'לשנות', 'הלכתי', + 'אור', 'סוג', 'את', 'צריך', + 'בית', 'תמונה', 'לנסות', 'שלנו', + 'שוב', 'חיה', 'נקודה', + 'אמא', 'עולם', + 'ליד', 'לבנות', 'עצמי', 'כדור הארץ', 'אב']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language 
detection) + * Words are delimited by Unicode-aware lookarounds rather than \b + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( + 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index 56b381f2f..c3b7f9edc 100755 --- a/src/locale/hi/configure.ini +++ b/src/locale/hi/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php index 846c070aa..a4b1001b6 100755 --- a/src/locale/hi/resources/Tokenizer.php +++ b/src/locale/hi/resources/Tokenizer.php @@ -43,6 +43,23 @@ use seekquarry\yioop\configs as C; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @var array + */ + public static $stop_words = ['जैसा', 'मैं', 'उसके', 'कि', 'वह', 'था', 'के', + 'लिए', 'पर', 'हैं', 'साथ', 'वे', 'हो', 'पर', 'एक', 'है', 'इस', 'से', 'द्वारा', + 'गरम', 'शब्द', 'लेकिन', 'क्या', 'कुछ', 'है', 'यह', 'आप', 'या', 'था', 'की', + 'तक', 'और', 'एक', 'में', 'हम', 'कर', 'सकते', 'हैं', 'बाहर', 'अन्य', 'थे', 'जो', + 'कर', 'उनके', 'समय', 'अगर', 'होगा', 'कैसे', 'कहा', 'एक', 'प्रत्येक', 'बता', + 'करता', 'है', 'सेट', 'तीन', 'चाहते हैं', 'हवा', 'अच्छी तरह से', 'भी', 'खेलने', + 'छोटे', 'अंत', 'डाल', 'घर', 'पढ़ा', 'हाथ', 'बंदरगाह', 'बड़ा', 'जादू', 'जोड़', + 'और', 'भी', 'भूमि', 'यहाँ', 'चाहिए', 'बड़ा', 'उच्च', 'ऐसा', 'का', 'पालन', 'करें', + 'अधिनियम', 'क्यों', 'पूछना', 'पुरुषों', 'परिवर्तन', 'चला', 'गया', 'प्रकाश', 'तरह', + 'बंद', 'आवश्यकता', 'घर', 'तस्वीर', 'कोशिश', 'हमें', 'फिर', 'पशु', 'बिंदु', 'मां', + 'दुनिया', 'निकट', 'बनाना', 'आत्म', 'पृथ्वी', 'पिता']; /** * List of verb-like parts of speech that might appear in lexicon * @var array @@ -82,6 +99,23 @@ class Tokenizer * @var array */ public static $no_stem_list = []; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). Words are delimited by Unicode-aware + * lookarounds, not \b, so combining marks stay attached to their word + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( + 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } /** * Stub function which could be used for a word segmenter. 
* Such a segmenter on input thisisabunchofwords would output diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini index 8f4b86bb1..3aa8630a7 100755 --- a/src/locale/in_ID/configure.ini +++ b/src/locale/in_ID/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/in_ID/resources/Tokenizer.php b/src/locale/in_ID/resources/Tokenizer.php index 372692a5f..5b4fc4abd 100755 --- a/src/locale/in_ID/resources/Tokenizer.php +++ b/src/locale/in_ID/resources/Tokenizer.php @@ -40,9 +40,47 @@ use seekquarry\yioop\models\LocaleModel; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @array + */ + public static $stop_words = ['sebagai', 'saya', 'nya', 'bahwa', 'dia', + 'adalah', 'untuk', 'pada', 'adalah', 'dengan', 'mereka', 'menjadi', + 'di', 'satu', 'memiliki', 'ini', 'dari', 'oleh', 'hot', 'kata', + 'tapi', 'apa', 'beberapa', 'adalah', 'itu', 'anda', 'atau', + 'memiliki', 'itu', 'dari', 'untuk', 'dan', 'sebuah', 'di', 'kami', + 'bisa', 'out', 'lainnya', 'yang', 'yang', 'melakukan', 'mereka', + 'waktu', 'jika', 'akan', 'bagaimana', 'kata', 'an', 'masing-masing', + 'memberitahu', 'tidak', 'Kumpulan', 'tiga', 'ingin', 'udara', 'baik', + 'juga', 'bermain', 'kecil', 'end', 'menempatkan', 'rumah', 'baca', + 'tangan', 'pelabuhan', 'besar', 'mantra', 'tambahkan', 'bahkan', + 'tanah', 'di sini', 'harus', 'besar', 'tinggi', 'seperti', 'ikuti', + 'tindakan', 'mengapa', 'bertanya', 'laki-laki', 'perubahan', 'pergi', + 'cahaya', 'jenis', 'off', 'perlu', 'rumah', 'gambar', 'coba', 'kami', + 'lagi', 'hewan', 'titik', 'ibu', 'dunia', 'dekat', 'membangun', + 'diri', 'bumi', 'ayah']; /** * How many characters in a char gram for this locale * @var int */ public static 
$char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). Words are delimited by Unicode-aware + * lookarounds, not \b, so combining marks stay attached to their word + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( + 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index 7e9f9f8a9..193e80e11 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "Yioop! Motore di Ricerca in PHP: %s" rss_layout_description = "Risultati di ricerca per: %s" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index a8d958685..94bb069e3 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/ja/resources/Tokenizer.php b/src/locale/ja/resources/Tokenizer.php index ad0faa2e2..04cd56333 100755 --- a/src/locale/ja/resources/Tokenizer.php +++ b/src/locale/ja/resources/Tokenizer.php @@ -37,10 +37,45 @@ namespace seekquarry\yioop\locale\ja\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @var array + */ + public static $stop_words = ['ように', '私は', '彼の', 'その', '彼', + 'た', 'ために', '上の', 'アール', 'とともに', '彼ら', 'ある', 'アット', + '一つ', '持っている', 'この', 'から', 'バイ', 'ホット', '言葉', 'しかし', '何', + 'いくつかの', 'です', 'それ', 'あなた', 'または', '持っていた', 'インクルード', 'の', + 'へ', 'そして', 'は', 'で', '我々', '缶', 'アウト', 'その他', 'だった', + 'これ', 'やる', 'それらの', '時間', 'もし', '意志', '方法', '前記', 'の', + 'それぞれ', '言う', 'し', 'セット', '個', '欲しい', '空気', 'よく', + 'また', '遊ぶ', '小さい', '終わり', '置く', 'ホーム', '読む', '手', + 'ポート', '大きい', 'スペル', '加える', 'さらに', '土地', 'ここに', + 'しなければならない', '大きい', '高い', 'そのような', '続く', '行為', + 'なぜ', '頼む', '人々', '変更', '行ってきました', '光', '種類', 'オフ', + '必要', '家', '絵', '試す', '私たち', '再び', '動物', 'ポイント', '母', + '世界', '近く', 'ビルド', '自己', '地球', '父']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 3; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). A stop word is removed wherever it occurs; + * no word-boundary check is made around a match + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?:' . implode('|', array_map('preg_quote', 
+ self::$stop_words)) . ')/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } - diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 5898ce08d..540e943fb 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "ಪಿಹೆಚಪಿ ಶೋಧನಾ ಯಂತ್ರ - ಯ rss_layout_description = "ಈ ಶೋಧನಾ ಫಲಿತಾಂಶಗಳು ನಿಮ್ಮ ಪ್ರಶ್ನೆ: %s ಯ ಉತ್ತರ" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/kn/resources/Tokenizer.php b/src/locale/kn/resources/Tokenizer.php index d60577de5..1be3d1f65 100755 --- a/src/locale/kn/resources/Tokenizer.php +++ b/src/locale/kn/resources/Tokenizer.php @@ -37,9 +37,45 @@ namespace seekquarry\yioop\locale\kn\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @var array + */ + public static $stop_words = ['ಮಾಹಿತಿ', 'ನಾನು', 'ಅವರ', 'ಆ', 'ಅವರು', + 'ಆಗಿತ್ತು', 'ಫಾರ್', 'ಮೇಲೆ', 'ಇವೆ', 'ಜೊತೆ', 'ಅವರು', 'ಎಂದು', 'ನಲ್ಲಿ', 'ಒಂದು', + 'ಹೊಂದಿವೆ', 'ಈ', 'ರಿಂದ', 'ಮೂಲಕ', 'ಬಿಸಿ', 'ಪದ', 'ಆದರೆ', 'ಏನು', 'ಕೆಲವು', + 'ಆಗಿದೆ', 'ಇದು', 'ನೀವು', 'ಅಥವಾ', 'ಹೊಂದಿತ್ತು', 'ದಿ', 'ನ', 'ಗೆ', 'ಮತ್ತು', + 'ಒಂದು', 'ರಲ್ಲಿ', 'ನಾವು', 'ಮಾಡಬಹುದು', 'ಔಟ್', 'ಇತರ', 'ಎಂದು', 'ಇದು', + 'ಹಾಗೆ', 'ತಮ್ಮ', 'ಸಮಯ', 'ವೇಳೆ', 'ತಿನ್ನುವೆ', 'ಹೇಗೆ', 'ಹೇಳಿದರು', 'ಒಂದು', + 'ಪ್ರತಿ', 'ಹೇಳಲು', 'ಮಾಡುತ್ತದೆ', 'ಸೆಟ್', 'ಮೂರು', 'ಬಯಸುವ', 'ಗಾಳಿ', 'ಹಾಗೂ', + 'ಸಹ', 'ಆಡಲು', 'ಸಣ್ಣ', 'ಕೊನೆಯಲ್ಲಿ', 'ಪುಟ್', 'ಮನೆ', 'ಓದಲು', 'ಕೈ', 'ಬಂದರು', + 'ದೊಡ್ಡ', 'ಕಾಗುಣಿತ', 'ಸೇರಿಸಬಹುದು', 'ಸಹ', 'ಭೂಮಿ', 'ಇಲ್ಲಿ', 'ಮಾಡಬೇಕಾಗುತ್ತದೆ', + 'ದೊಡ್ಡ', 'ಹೆಚ್ಚಿನ', 'ಇಂತಹ', 'ಅನುಸರಿಸಿ', 'ಆಕ್ಟ್', 'ಏಕೆ', 'ಕೇಳಿ', 'ಪುರುಷರು', + 'ಬದಲಾವಣೆ', 'ಹೋದರು', 'ಬೆಳಕಿನ', 'ರೀತಿಯ', 'ಆಫ್', 'ಅಗತ್ಯವಿದೆ', 'ಮನೆ', 'ಚಿತ್ರ', + 'ಪ್ರಯತ್ನಿಸಿ', 'ನಮಗೆ', 'ಮತ್ತೆ', 'ಪ್ರಾಣಿ', 'ಪಾಯಿಂಟ್', 'ತಾಯಿ', 'ವಿಶ್ವದ', 'ಬಳಿ', + 'ನಿರ್ಮಿಸಲು', 'ಸ್ವಯಂ', 'ಭೂಮಿಯ', 'ತಂದೆ']; /** * How many characters in a char gram for this locale * @var int */ public 
static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). Words are delimited by Unicode-aware + * lookarounds, not \b, so combining marks stay attached to their word + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( + 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index 028e5e469..e505c6ac7 100755 --- a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "PHP 검색 엔진 - Yioop! : %s" rss_layout_description = "%s 에 대한 서치 결과:" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/ko/resources/Tokenizer.php b/src/locale/ko/resources/Tokenizer.php index 74a23d38e..776813f51 100755 --- a/src/locale/ko/resources/Tokenizer.php +++ b/src/locale/ko/resources/Tokenizer.php @@ -39,9 +39,43 @@ use seekquarry\yioop\models\LocaleModel; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @var array + */ + public static $stop_words = ['로', '나는', '그의', '그', '그', '했다', + '에 대한', '에', '아르', '와', '그들', '있다', '에', '일', '이', '이', + '부터', '에 의해', '뜨거운', '단어', '하지만', '무엇', '다소', '이다', '그', + '당신', '또는', '했다', '에', '의', '에', '과', '이', '에', '우리', '수', + '아웃', '다른', '했다', '하는', '할', '자신의', '시간', '면', '것', '방법', + '말했다', '이', '각', '이야기', '하지', '세트', '세', '필요', '공기', '잘', + '또한', '재생', '작은', '끝', '넣어', '홈', '읽기', '손', '포트', '큰', + '철자', '추가', '도', '땅', '여기', '해야', '큰', '높은', '이러한', '따라', + '행위', '이유', '문의', '남자', '변경', '갔다', '빛', '종류', '오프', + '필요가있다', '집', '사진', '시험', '우리', '다시', '동물', '포인트', + '어머니', '세계', '가까운', '구축', '자기', '지구', '아버지']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 3; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). A stop word is removed wherever it occurs; + * no word-boundary check is made around a match + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?:' . implode('|', array_map('preg_quote', + self::$stop_words)) . ')/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } } diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index 285828dd8..52b2650d3 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "PHP Search Engine - Yioop! 
:%S" rss_layout_description = "Zoek resultaten voor: %s" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "Yioop" ; ; WikiView.php diff --git a/src/locale/nl/resources/Tokenizer.php b/src/locale/nl/resources/Tokenizer.php index 082d85833..f39dfedb4 100755 --- a/src/locale/nl/resources/Tokenizer.php +++ b/src/locale/nl/resources/Tokenizer.php @@ -36,6 +36,26 @@ namespace seekquarry\yioop\locale\nl\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @var array + */ + public static $stop_words = ['als', 'I', 'zijn', 'dat', 'hij', 'was', + 'voor', 'op', 'zijn', 'met', 'ze', 'zijn', 'bij', 'een', 'hebben', + 'deze', 'van', 'door', 'heet', 'woord', 'maar', 'wat', 'sommige', + 'is', 'het', 'u', 'of', 'had', 'de', 'van', 'aan', 'en', 'een', + 'in', 'we', 'kan', 'uit', 'andere', 'waren', 'die', 'doen', 'hun', + 'tijd', 'indien', 'zal', 'hoe', 'zei', 'een', 'elk', 'vertellen', + 'doet', 'set', 'drie', 'willen', 'lucht', 'goed', 'ook', 'spelen', + 'klein', 'end', 'zetten', 'thuis', 'lezen', 'de hand', 'poort', + 'grote', 'spell', 'toevoegen', 'zelfs', 'land', 'hier', 'moet', + 'grote', 'hoog', 'dergelijke', 'volgen', 'act', 'waarom', 'vragen', + 'mannen', 'verandering', 'ging', 'licht', 'soort', 'uitgeschakeld', + 'nodig', 'huis', 'afbeelding', 'proberen', 'ons', 'weer', 'dier', + 'punt', 'moeder', 'wereld', 'dichtbij', 'bouwen', 'zelf', 'aarde', + 'vader']; /** * Words we don't want to be stemmed * @var array @@ -52,6 +72,23 @@ class Tokenizer "sme", "spe", "ste", "the", "tje", "uce", "uden", "uien", "uren", "use", "uwe", "vse", "ype" ]; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection). Words are delimited by Unicode-aware + * lookarounds, not \b, so combining marks stay attached to their word + * @param mixed $data a string or an array of strings to clean + * @return mixed $data minus stop words; $data itself if the regex fails + */ + public 
static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(?<![\p{L}\p{M}\p{N}_])(?:' . implode('|', array_map( + 'preg_quote', self::$stop_words)) . ')(?![\p{L}\p{M}\p{N}_])/u'; + } + $cleaned = preg_replace($pattern, '', $data); + return ($cleaned === null) ? $data : $cleaned; + } /** * Stub function which could be used for a word segmenter. * Such a segmenter on input thisisabunchofwords would output diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index 1288a8cc8..ab35706c7 100755 --- a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/pl/resources/Tokenizer.php b/src/locale/pl/resources/Tokenizer.php index e27e344cf..6f4ed36a9 100755 --- a/src/locale/pl/resources/Tokenizer.php +++ b/src/locale/pl/resources/Tokenizer.php @@ -37,9 +37,46 @@ namespace seekquarry\yioop\locale\pl\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @array + */ + public static $stop_words = ['jak', 'I', 'jego', 'że', 'on', 'było', 'dla', + 'na', 'są', 'zespół', 'oni', 'być', 'w', 'jeden', 'mieć', 'tego', 'z', + 'przez', 'gorący', 'słowo', 'ale', 'co', 'niektóre', 'jest', 'to', + 'ty', 'lub', 'miał', 'kilka', 'stopa', 'do', 'i', 'ciągnąć', 'w', + 'my', 'puszka', 'na zewnątrz', 'inne', 'były', 'który', 'zrobić', + 'ich', 'czas', 'jeśli', 'będzie', 'jak', 'powiedział', 'próba', + 'każda', 'powiedzieć', 'nie', 'zestaw', 'trzy', 'chcą', 'powietrze', + 'dobrze', 'również', 'grać', 'mały', 'koniec', 'wkładać', + 'Strona', 'główna', 'czytaj', 'ręka', 'port', 'duży', 'zaklęcie', + 'dodać', 'nawet', 'ziemia', 'tutaj', 'musi', 'duży', 'wysoki', + 'takie', 'śledzić', 'akt', 'dlaczego', 'zapytaj', 'mężczyźni', + 'zmiana', 'poszedł', 'światła', 'rodzaj', 'z', 'potrzeba', 'dom', + 'obraz', 'spróbuj', 'nas', 'ponownie', 'zwierząt', 'punkt', 'matka', + 'świat', 'blisko', 'budować', 'własny', 'ziemia', 'ojciec']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . 
')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } } diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index 6328b4f8d..986c5bfb6 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/pt/resources/Tokenizer.php b/src/locale/pt/resources/Tokenizer.php index df73e52bf..10a3cf63f 100755 --- a/src/locale/pt/resources/Tokenizer.php +++ b/src/locale/pt/resources/Tokenizer.php @@ -38,6 +38,27 @@ namespace seekquarry\yioop\locale\pt\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @array + */ + public static $stop_words = ['como', 'I', 'seu', 'ele', 'foi', 'para', 'em', + 'são', 'com', 'eles', 'ser', 'em', 'uma', 'tem', 'este', + 'partir', 'de', 'por', 'quente', 'palavra', 'mas', + 'que', 'alguns', 'é', 'ele', 'você', 'ou', 'teve', + 'o', 'a', 'e', 'uma', 'em', 'nós', 'lata', 'fora', + 'outro', 'foram', 'que', 'fazer', 'seu', 'tempo', 'se', + 'vontade', 'como', 'disse', 'uma', 'cada', 'dizer', 'faz', + 'conjunto', 'três', 'quer', 'ar', 'bem', 'também', 'jogar', + 'pequeno', 'fim', 'colocar', 'casa', 'ler', 'mão', 'port', + 'grande', 'soletrar', 'adicionar', 'mesmo', 'terra', 'aqui', + 'necessário', 'grande', 'alto', 'tais', 'siga', 'ato', + 'perguntar', 'homens', 'mudança', 'fui', 'luz', + 'tipo', 'off', 'precisa', 'casa', 'imagem', 'tentar', 'nós', + 'novamente', 'animais', 'ponto', 'mãe', 'mundo', 'perto', + 'construir', 'auto', 'terra', 'pai']; /** * Phrases we would like yioop to rewrite before performing a query * @var array @@ -75,6 +96,23 @@ class Tokenizer * @var string */ private static $rv = ""; + /** + * Removes the stop words from the page 
(used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } /** * Stub function which could be used for a word segmenter. * Such a segmenter on input thisisabunchofwords would output diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index 976db947b..2c0ac78d6 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index cca42f7bd..6888ac855 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "Yioop" ; ; WikiView.php diff --git a/src/locale/te/resources/Tokenizer.php b/src/locale/te/resources/Tokenizer.php index 73f23b363..d38b26287 100755 --- a/src/locale/te/resources/Tokenizer.php +++ b/src/locale/te/resources/Tokenizer.php @@ -39,9 +39,45 @@ use seekquarry\yioop\models\LocaleModel; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @array + */ + public static $stop_words = ['గా', 'నేను', 'తన', 'ఆ', 'అతను', 'ఉంది', + 'కోసం', 'న', 'ఉన్నాయి', 'తో', 'వారు', 'ఉంటుంది', 'వద్ద', 'ఒకటి', 'కలిగి', + 'ఈ', 'నుండి', 'ద్వారా', 'వేడి', 'పదం', 'కానీ', 'ఏమి', 'కొన్ని', 'ఉంది', + 'ఇది', 'మీరు', 'లేదా', 'వచ్చింది', 'ది', 'యొక్క', 'కు', 'మరియు', 'ఒక', + 'లో', 'మేము', 'చెయ్యవచ్చు', 'అవుట్', 'ఇతర', 'ఉన్నాయి', 'ఇది', 'చేయండి', + 'వారి', 'సమయం', 'ఉంటే', 'రెడీ', 'ఎలా', 'అన్నాడు', 'ఒక', 'ప్రతి', 'చెప్పండి', + 'చేస్తుంది', 'సెట్', 'మూడు', 'కావలసిన', 'గాలి', 'బాగా', 'కూడా', 'ప్లే', + 'చిన్న', 'ముగింపు', 'చాలు', 'హోమ్', 'చదవడానికి', 'చేతి', 'పోర్ట్', 'పెద్ద', + 'అక్షరక్రమ', 'జోడించండి', 'కూడా', 'భూమి', 'ఇక్కడ', 'తప్పక', 'పెద్ద', 'అధిక', + 'ఇటువంటి', 'అనుసరించండి', 'చట్టం', 'ఎందుకు', 'గోవా', 'పురుషులు', 'మార్పు', + 'వెళ్ళింది', 'కాంతి', 'రకం', 'ఆఫ్', 'అవసరం', 'ఇల్లు', 'చిత్రాన్ని', 'ప్రయత్నించండి', + 'మాకు', 'మళ్ళీ', 'జంతు', 'పాయింట్', 'తల్లి', 'ప్రపంచ', 'సమీపంలో', + 'నిర్మించడానికి', 'స్వీయ', 'భూమి', 'తండ్రి']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . 
')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } } diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index 8014cab32..0f5e3dde1 100755 --- a/src/locale/th/configure.ini +++ b/src/locale/th/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/th/resources/Tokenizer.php b/src/locale/th/resources/Tokenizer.php index fd9f2d154..d5d0734f8 100755 --- a/src/locale/th/resources/Tokenizer.php +++ b/src/locale/th/resources/Tokenizer.php @@ -37,9 +37,44 @@ namespace seekquarry\yioop\locale\th\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @array + */ + public static $stop_words = ['เป็น', 'ฉัน', 'ของเขา', 'ว่า', 'เขา', 'เป็น', + 'สำหรับ', 'บน', 'มี', 'ด้วย', 'พวกเขา', 'จะเป็น', 'ที่', 'หนึ่ง', 'มี', + 'นี้', 'จาก', 'โดย', 'ร้อน', 'คำ', 'แต่', 'สิ่งที่', 'บาง', 'เป็น', 'มัน', + 'คุณ', 'หรือ', 'มี', 'ได้', 'ของ', 'ที่จะ', 'และ', 'กรอก', 'ใน', 'เรา', + 'สามารถ', 'ออก', 'อื่น ๆ', 'เป็น', 'ซึ่ง', 'ทำ', 'ของพวกเขา', 'เวลา', + 'ถ้า', 'จะ', 'วิธี', 'กล่าวว่า', 'บิน', 'แต่ละ', 'บอก', 'ไม่', 'ชุด', 'สาม', + 'ต้องการ', 'อากาศ', 'ดี', 'ด้วย', 'เล่น', 'ขนาดเล็ก', 'ปลาย', 'ใส่', 'บ้าน', + 'อ่าน', 'มือ', 'พอร์ต', 'ที่มีขนาดใหญ่', 'สะกด', 'เพิ่ม', 'แม้กระทั่ง', 'ที่ดิน', + 'ที่นี่', 'ต้อง', 'ใหญ่', 'สูง', 'เช่น', 'ทำตาม', 'การกระทำ', 'เหตุผลที่', + 'ขอให้', 'ผู้ชาย', 'การเปลี่ยนแปลง', 'ไป', 'แสง', 'ชนิด', 'ออก', 'ต้อง', + 'บ้าน', 'ภาพ', 'พยายาม', 'เรา', 'อีกครั้ง', 'สัตว์', 'จุด', 'แม่', 'โลก', + 'อยู่ใกล้', 'สร้าง', 'ตนเอง', 'โลก', 'พ่อของ']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + 
* and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } } diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index d2ee070a2..30ca2e314 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/tr/resources/Tokenizer.php b/src/locale/tr/resources/Tokenizer.php index 73cbd297e..16f4d7eaf 100755 --- a/src/locale/tr/resources/Tokenizer.php +++ b/src/locale/tr/resources/Tokenizer.php @@ -38,9 +38,46 @@ namespace seekquarry\yioop\locale\tr\resources; */ class Tokenizer { + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @array + */ + public static $stop_words = ['olarak', 'ben', 'onun', 'bu', 'diye', + 'oldu', 'için', 'üzerinde', 'vardır', 'ile', 'onlar', 'olmak', 'at', + 'bir', 'var', 'Bu', 'dan', 'tarafından', 'sıcak', 'kelime', 'ancak', + 'ne', 'bazı', 'olduğunu', 'o', 'sen', 'veya', 'vardı', '', 'arasında', + 'karşı', 've', 'bir', 'içinde', 'biz', 'can', 'üzerinden', 'diğer', + 'vardı', 'hangi', 'do', 'onların', 'zaman', 'eğer', 'olacak', + 'nasıl', 'dedi', 'bir', 'her', 'söyle', 'yok', 'set', 'üç', + 'istiyorum', 'hava', 'iyi', 'ayrıca', 'oynamak', 'küçük', 'son', + 'koymak', 'ev', 'okumak', 'el', 'liman', 'büyük', 'büyü', 'ekleyin', + 'hatta', 'arazi', 'burada', 'gerekir', 'büyük', 'yüksek', 'böyle', + 'izleyin', 'hareket', 'neden', 'sormak', 'erkekler', 'değişim', + 'gitti', 'ışık', 'tür', 'kapalı', 'gerek', 'ev', 'resim', 'denemek', + 'bizi', 'tekrar', 'hayvan', 'nokta', 'anne', 'dünya', 'yakın', + 'inşa', 'etmek', 'öz', 'toprak', 'baba']; /** * How many characters in a char gram for this locale * @var int */ public static $char_gram_len = 5; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . 
')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } } diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index a01e2386b..313406dca 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/vi_VN/resources/Tokenizer.php b/src/locale/vi_VN/resources/Tokenizer.php index f6ee9cd39..9547380aa 100755 --- a/src/locale/vi_VN/resources/Tokenizer.php +++ b/src/locale/vi_VN/resources/Tokenizer.php @@ -39,5 +39,42 @@ namespace seekquarry\yioop\locale\vi_VN\resources; */ class Tokenizer { - -} \ No newline at end of file + /** + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. This is also used + * for language detection + * @array + */ + public static $stop_words = ['như', 'tôi', 'mình', 'mà', 'ông', 'là', + 'cho', 'trên', 'là', 'với', 'họ', 'được', 'tại', 'một', 'có', + 'này', 'từ', 'bởi', 'nóng', 'từ', 'nhưng', 'những', 'gì', 'một', + 'số', 'là', 'nó', 'anh', 'hoặc', 'có', 'các', 'của', 'để', 'và', + 'một', 'trong', 'chúng', 'tôi', 'có', 'thể', 'ra', 'khác', 'là', + 'mà', 'làm', 'của', 'họ', 'thời', 'gian', 'nếu', 'sẽ', 'như', 'thế', + 'nào', 'nói', 'một', 'môi', 'nói ', 'không', 'bộ', 'ba', 'muốn', + 'không', 'khí', 'cũng', 'cũng', 'chơi', 'nhỏ', 'cuố', 'đặt', 'nhà', + 'đọc', 'tay', 'cổng', 'lớn', 'chính', 'tả', 'thêm', 'thậm', 'chí', + 'đất', 'ở', 'đây', 'phải', 'lớn', 'cao', 'như', 'vậy', 'theo', + 'hành', 'động', 'lý', 'do ', 'tại ', 'sao', 'xin', 'người', 'đàn', + 'ông', 'thay', 'đổi', 'đi', 'ánh', 'sáng', 'loại', 'tắt', 'cần', 'nhà', + 'hình', 'ảnh', 'thử', 'chúng', 'tôi', 'một ', 'lần', 'nữa', 'động', + 'vật', 'điểm', 'mẹ', 'thế', 'giới', 'gần', 'xây', 'dựng', 'tự', 'đất', + 'cha']; + /** + * Removes the stop words from the page 
(used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) + { + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } +} diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index f1355ae2d..7324865eb 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -1691,7 +1691,7 @@ rss_layout_title = "" rss_layout_description = "" ; ; View.php -view_locale_version = "3" +view_locale_version = "4" view_logo_alt_text = "" ; ; WikiView.php diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php index 8202c25a0..c50dc568e 100755 --- a/src/locale/zh_CN/resources/Tokenizer.php +++ b/src/locale/zh_CN/resources/Tokenizer.php @@ -41,13 +41,37 @@ use seekquarry\yioop\library\PhraseParser; class Tokenizer { /** - * Removes the stop words from the page - * @param string $page the page to remove stop words from. - * @return string $page with no stop words + * A list of frequently occurring terms for this locale which should + * be excluded from certain kinds of queries. 
This is also used + * for language detection + * @array */ - public static function stopwordsRemover($page) + public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去', + '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你', + '的', '是', '了', '他', '么', '们', '在', '说', '为', '好', '吧', '知道', + '我的', '和', '你的', '想', '只', '很', '都', '对', '把', '啊', '怎', '得', + '还', '过', '不是', '到', '样', '飞', '远', '身', '任何', '生活', '够', + '号', '兰', '瑞', '达', '或', '愿', '蒂', '別', '军', '正', '是不是', + '证', '不用', '三', '乐', '吉', '男人', '告訴', '路', '搞', '可是', + '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚', + '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定', + '女孩', '世界']; + /** + * Removes the stop words from the page (used for Word Cloud generation + * and language detection) + * + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words + */ + public static function stopwordsRemover($data) { - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/(' . implode('|', self::$stop_words) . ')/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * A word segmenter.