Add stopwordsRemover for all locales, use for language detection, a=chris

Chris Pollett [2019-06-07 20:Jun:th]

Add stopwordsRemover for all locales, use for language detection, a=chris

Filename
src/configs/Config.php
src/index.php
src/library/LocaleFunctions.php
src/locale/ar/configure.ini
src/locale/bn/configure.ini
src/locale/bn/resources/Tokenizer.php
src/locale/de/configure.ini
src/locale/de/resources/Tokenizer.php
src/locale/en_US/configure.ini
src/locale/es/configure.ini
src/locale/fa/configure.ini
src/locale/fr_FR/configure.ini
src/locale/he/configure.ini
src/locale/he/resources/Tokenizer.php
src/locale/hi/configure.ini
src/locale/hi/resources/Tokenizer.php
src/locale/in_ID/configure.ini
src/locale/in_ID/resources/Tokenizer.php
src/locale/it/configure.ini
src/locale/ja/configure.ini
src/locale/ja/resources/Tokenizer.php
src/locale/kn/configure.ini
src/locale/kn/resources/Tokenizer.php
src/locale/ko/configure.ini
src/locale/ko/resources/Tokenizer.php
src/locale/nl/configure.ini
src/locale/nl/resources/Tokenizer.php
src/locale/pl/configure.ini
src/locale/pl/resources/Tokenizer.php
src/locale/pt/configure.ini
src/locale/pt/resources/Tokenizer.php
src/locale/ru/configure.ini
src/locale/te/configure.ini
src/locale/te/resources/Tokenizer.php
src/locale/th/configure.ini
src/locale/th/resources/Tokenizer.php
src/locale/tr/configure.ini
src/locale/tr/resources/Tokenizer.php
src/locale/vi_VN/configure.ini
src/locale/vi_VN/resources/Tokenizer.php
src/locale/zh_CN/configure.ini
src/locale/zh_CN/resources/Tokenizer.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index fcb4939fd..f3c8fa999 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -169,7 +169,7 @@ nsdefine('MIN_AD_VERSION', 36);
  * Version number for upgrading locale resource folders and for upgrading
  * public and help wikis
  */
-nsdefine('RESOURCES_WIKI_VERSION', 3);
+nsdefine('RESOURCES_WIKI_VERSION', 4);
 /**
  * nsdefine's the BASE_URL constant for this script
  * if run from the command line as part of index.php HTTP server scrip
diff --git a/src/index.php b/src/index.php
index 8bbafea8e..840b91695 100644
--- a/src/index.php
+++ b/src/index.php
@@ -77,7 +77,7 @@ function bootstrap($web_site = null, $start_new_session = true)
     /**
      * Load global functions related to localization
      */
-    require_once __DIR__."/library/LocaleFunctions.php";
+    require_once __DIR__ . "/library/LocaleFunctions.php";
     ini_set("memory_limit","500M");
     if (!empty($web_site)) {
         if ((empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource')) {
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index 220423453..ec294565c 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -100,64 +100,14 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
-    $original_phrase_string = $phrase_string;
-    $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
-    $sub = C\PUNCT . "|[0-9]|\s";
-    $phrase_string = preg_replace('/' . $sub . '/', "", $phrase_string);
-    $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8");
     $len = strlen($phrase_string);
-    $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0,
-        'th' => 0, 'vi' => 0, 'zh-CN' => 0];
-    $guess[$locale_tag] = 1;
-    for ($i = 0; $i < $len; $i += 4) {
-        $start = ord($phrase_string[$i+2]);
-        $next = ord($phrase_string[$i+3]);
-        if ($start >= 6 && $start <= 7) {
-            if ($locale_tag == "fa") {
-                $guess[$locale_tag] +=2;
-            } else {
-                $guess['ar'] += 2;
-            }
-        } else if ($start == 5 && $next >= 144) {
-            $guess['he'] += 2;
-        } else if (($start == 9 && $next < 128) || ($start == 168 &&
-            $next >= 224)) {
-            $guess['hi'] += 2;
-        } else if ($start == 17 || $start >= 172 && $start < 215) {
-            $guess['ko'] += 2;
-        } else if ($start >= 48 && $start <= 49) {
-            $guess['ja'] += 3;
-        } else if ($start == 4 || ($start == 5 && $next < 48)) {
-            $guess['ru']++;
-        } else if ($start == 14 && $next < 128) {
-            $guess['th'] += 2;
-        } else if (($start == 30 && in_array($next, [199, 209, 219])) ||
-            ($start == 1 && in_array($next, [160, 161, 175, 176]))) {
-            $guess['vi'] += 7;
-        } else if ($start >= 78 && $start <= 159) {
-            $guess['zh-CN'] += 4;
-        } else if ($start == 0 && $next < 128) {
-            $guess[$locale_tag]++; // assume ascii is from $locale_tag
-        }
-    }
-    $num_points = ($len / 4) - 1; //there will be a lead and tail space
-    $max = $guess[$locale_tag];
-    if ($num_points >= 0 ) {
-        foreach ($guess as $tag => $cnt) {
-            if ($cnt >= $num_points && $cnt > $max) {
-                $locale_tag = $tag;
-                $max = $cnt;
-                break;
-            }
-        }
-    }
-    if ($locale_tag == 'en-US') {
-        $len = strlen($original_phrase_string);
-        $locale_tag = 'en-US';
-        foreach (['en-US', 'fr-FR', 'es', 'it'] as $lang) {
-            $tokenizer = PhraseParser::getTokenizer($lang);
+    foreach (['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi',
+        'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th',
+        'vi-VN', 'zh-CN'] as $lang) {
+        $tokenizer = PhraseParser::getTokenizer($lang);
+        if ($tokenizer) {
             $test_len =
-                strlen($tokenizer->stopwordsRemover($original_phrase_string));
+                strlen($tokenizer->stopwordsRemover($phrase_string));
             if ($test_len < $len) {
                 $len = $test_len;
                 $locale_tag = $lang;
diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini
index d2b08b6c1..29ca71c02 100755
--- a/src/locale/ar/configure.ini
+++ b/src/locale/ar/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "بي إتش بي محرك البحث-يوب!:  %s"
 rss_layout_description = "نتائج البحث ل:  %s"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini
index a8ecef528..d1316a74b 100755
--- a/src/locale/bn/configure.ini
+++ b/src/locale/bn/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/bn/resources/Tokenizer.php b/src/locale/bn/resources/Tokenizer.php
index 650e1ee60..ac6bcbe1b 100755
--- a/src/locale/bn/resources/Tokenizer.php
+++ b/src/locale/bn/resources/Tokenizer.php
@@ -37,9 +37,44 @@ namespace seekquarry\yioop\locale\bn\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['হিসাবে', 'আমি', 'তার', 'যে', 'তিনি', 'ছিল',
+        'জন্য', 'উপর', 'হয়', 'সঙ্গে', 'তারা', 'হতে', 'এ', 'এক', 'আছে', 'এই',
+        'থেকে', 'দ্বারা', 'গরম', 'শব্দ', 'কিন্তু', 'কি', 'কিছু', 'হয়', 'এটা', 'আপনি',
+        'বা', 'ছিল', 'দী', 'এর', 'থেকে', 'এবং', 'একটি', 'মধ্যে', 'আমরা', 'করতে',
+        'পারেন', 'আউট', 'অন্যান্য', 'ছিল', 'যা', 'কি', 'তাদের', 'সময়', 'যদি',
+        'অভিলাষ', 'কিভাবে', 'তিনি বলেন,', 'একটি', 'প্রতিটি', 'বলুন', 'না', 'সেট',
+        'তিন', 'চান', 'বায়ু', 'ভাল', 'এছাড়াও', 'খেলা', 'ছোট', 'শেষ', 'করা', 'হোম',
+        'পড়া', 'হাত', 'পোর্ট', 'বড়', 'বানান', 'যোগ করা', 'এমনকি', 'জমি', 'এখানে',
+        'অবশ্যই', 'বড়', 'উচ্চ', 'এমন', 'অনুসরণ করা', 'আইন', 'কেন', 'জিজ্ঞাসা',
+        'পুরুষ', 'পরিবর্তন', 'গিয়েছিলাম', 'আলো', 'ধরনের', 'বন্ধ', 'প্রয়োজন', 'ঘর',
+        'ছবি', 'চেষ্টা', 'আমাদের', 'আবার', 'পশু', 'বিন্দু', 'মা', 'বিশ্বের', 'কাছাকাছি',
+        'নির্মাণ', 'স্ব', 'পৃথিবী', 'বাবা'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from $data (used for Word Cloud
+     * generation and language detection). Word edges are tested with
+     * lookarounds because \b misfires beside vowel signs (marks, not \w).
+     *
+     * @param mixed $data string or array of strings to strip stop words from
+     * @return mixed $data with whole-word stop word occurrences removed
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/(?<![\p{L}\p{M}\p{N}_])(' . implode('|',
+                self::$stop_words) . ')(?![\p{L}\p{M}\p{N}_])/u';
+        }
+        return preg_replace($pattern, '', $data);
+    }
 }
diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini
index 18b2a829f..a5e1c36c3 100755
--- a/src/locale/de/configure.ini
+++ b/src/locale/de/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/de/resources/Tokenizer.php b/src/locale/de/resources/Tokenizer.php
index 924649332..a30d2731b 100755
--- a/src/locale/de/resources/Tokenizer.php
+++ b/src/locale/de/resources/Tokenizer.php
@@ -56,7 +56,8 @@ class Tokenizer
     public static $no_stem_list =["titanic"];
     /**
      * A list of frequently occurring terms for this locale which should
-     * be excluded from certain kinds of queries
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
      * @array
      */
     public static $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller',
@@ -150,7 +151,8 @@ class Tokenizer
         return $pre_segment;
     }
     /**
-     * Removes the stop words from the page (used for Word Cloud generation)
+     * Removes the stop words from the page (used for Word Cloud generation
+     * and language detection)
      *
      * @param mixed $data either a string or an array of string to remove
      *      stop words from
diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini
index 283fadf7c..895b970db 100644
--- a/src/locale/en_US/configure.ini
+++ b/src/locale/en_US/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "PHP Search Engine - Yioop! : %s"
 rss_layout_description = "Search results for: %s"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = "Yioop"
 ;
 ; WikiView.php
diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini
index f8b8f46b8..d72298972 100755
--- a/src/locale/es/configure.ini
+++ b/src/locale/es/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini
index 66dd15532..3cf6077a2 100755
--- a/src/locale/fa/configure.ini
+++ b/src/locale/fa/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "موتور جستجوی PHP - Yioop! : %s"
 rss_layout_description = "نتایج جستجو برای: %s"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini
index 0854d61ab..adaaffe36 100755
--- a/src/locale/fr_FR/configure.ini
+++ b/src/locale/fr_FR/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "Moteur de recherche PHP -Yioop! %s"
 rss_layout_description = "%s R&eacute;sultats"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini
index 2b63603a5..0f71065c5 100755
--- a/src/locale/he/configure.ini
+++ b/src/locale/he/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/he/resources/Tokenizer.php b/src/locale/he/resources/Tokenizer.php
index f382fe933..e3a1a5502 100755
--- a/src/locale/he/resources/Tokenizer.php
+++ b/src/locale/he/resources/Tokenizer.php
@@ -37,9 +37,53 @@ namespace seekquarry\yioop\locale\he\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['כמו', 'אני', 'שלו', 'ש', 'הוא',
+         'היה', 'עבור', 'על', 'הם', 'עם', 'הם',
+         'להיות', 'ב', 'אחד', 'יש לי', 'זה', 'מ', 'על ידי',
+         'חם', 'מילה', 'אבל', 'מה', 'כמה', 'הוא', 'זה',
+         'אתה', 'או', 'היה לי', 'עבור', 'של', 'אל',
+         'ו', 'זמן', 'ב', 'אנחנו', 'יכול',
+         'את', 'אחר', 'היו', 'ש', 'לעשות',
+         'שלהם', 'זמן', 'אם', 'יהיה', 'איך',
+         'אמר', 'בית', 'כל', 'לספר', 'עושה',
+         'סט', 'שלוש', 'רוצה', 'אוויר', 'גם',
+         'גם', 'לשחק', 'קטן',
+         'סוף', 'לשים', 'בית', 'לקרוא', 'יד', 'נמל', 'גדול',
+         'לאיית', 'להוסיף', 'אפילו', 'ארץ',
+         'כאן', 'חייב', 'גדול', 'גבוה',
+         'כזה', 'מעקב', 'מעשה', 'מדוע',
+         'שואל', 'אנשים', 'לשנות', 'הלכתי',
+         'אור', 'סוג', 'את', 'צריך',
+         'בית', 'תמונה', 'לנסות', 'שלנו',
+         'שוב', 'חיה', 'נקודה',
+         'אמא', 'עולם',
+         'ליד', 'לבנות', 'עצמי', 'כדור הארץ', 'אב'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Strips every stop word of this locale out of the supplied text. Both
+     * Word Cloud generation and language detection rely on this method.
+     *
+     * @param mixed $data a string, or an array of strings, to be filtered
+     * @return mixed $data after each whole-word stop word match has been
+     *      replaced by the empty string
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if ($pattern === "") {
+            $alternation = implode('|', self::$stop_words);
+            $pattern = '/\b(' . $alternation . ')\b/u';
+        }
+        return preg_replace($pattern, '', $data);
+    }
 }
diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini
index 56b381f2f..c3b7f9edc 100755
--- a/src/locale/hi/configure.ini
+++ b/src/locale/hi/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index 846c070aa..a4b1001b6 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -43,6 +43,23 @@ use seekquarry\yioop\configs as C;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['जैसा', 'मैं', 'उसके', 'कि', 'वह', 'था', 'के',
+        'लिए', 'पर', 'हैं', 'साथ', 'वे', 'हो', 'पर', 'एक', 'है', 'इस', 'से', 'द्वारा',
+        'गरम', 'शब्द', 'लेकिन', 'क्या', 'कुछ', 'है', 'यह', 'आप', 'या', 'था', 'की',
+        'तक', 'और', 'एक', 'में', 'हम', 'कर', 'सकते', 'हैं', 'बाहर', 'अन्य', 'थे', 'जो',
+        'कर', 'उनके', 'समय', 'अगर', 'होगा', 'कैसे', 'कहा', 'एक', 'प्रत्येक', 'बता',
+        'करता', 'है', 'सेट', 'तीन', 'चाहते हैं', 'हवा', 'अच्छी तरह से', 'भी', 'खेलने',
+        'छोटे', 'अंत', 'डाल', 'घर', 'पढ़ा', 'हाथ', 'बंदरगाह', 'बड़ा', 'जादू', 'जोड़',
+        'और', 'भी', 'भूमि', 'यहाँ', 'चाहिए', 'बड़ा', 'उच्च', 'ऐसा', 'का', 'पालन', 'करें',
+        'अधिनियम', 'क्यों', 'पूछना', 'पुरुषों', 'परिवर्तन', 'चला', 'गया', 'प्रकाश', 'तरह',
+        'बंद', 'आवश्यकता', 'घर', 'तस्वीर', 'कोशिश', 'हमें', 'फिर', 'पशु', 'बिंदु', 'मां',
+        'दुनिया', 'निकट', 'बनाना', 'आत्म', 'पृथ्वी', 'पिता'];
     /**
      * List of verb-like parts of speech that might appear in lexicon
      * @var array
@@ -82,6 +99,23 @@ class Tokenizer
      * @var array
      */
     public static $no_stem_list = [];
+    /**
+     * Removes this locale's stop words from $data (used for Word Cloud
+     * generation and language detection). Word edges are tested with
+     * lookarounds because \b misfires beside vowel signs (marks, not \w).
+     *
+     * @param mixed $data string or array of strings to strip stop words from
+     * @return mixed $data with whole-word stop word occurrences removed
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/(?<![\p{L}\p{M}\p{N}_])(' . implode('|',
+                self::$stop_words) . ')(?![\p{L}\p{M}\p{N}_])/u';
+        }
+        return preg_replace($pattern, '', $data);
+    }
     /**
      * Stub function which could be used for a word segmenter.
      * Such a segmenter on input thisisabunchofwords would output
diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini
index 8f4b86bb1..3aa8630a7 100755
--- a/src/locale/in_ID/configure.ini
+++ b/src/locale/in_ID/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/in_ID/resources/Tokenizer.php b/src/locale/in_ID/resources/Tokenizer.php
index 372692a5f..5b4fc4abd 100755
--- a/src/locale/in_ID/resources/Tokenizer.php
+++ b/src/locale/in_ID/resources/Tokenizer.php
@@ -40,9 +40,47 @@ use seekquarry\yioop\models\LocaleModel;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['sebagai', 'saya', 'nya', 'bahwa', 'dia',
+        'adalah', 'untuk', 'pada', 'adalah', 'dengan', 'mereka', 'menjadi',
+        'di', 'satu', 'memiliki', 'ini', 'dari', 'oleh', 'hot', 'kata',
+        'tapi', 'apa', 'beberapa', 'adalah', 'itu', 'anda', 'atau',
+        'memiliki', 'itu', 'dari', 'untuk', 'dan', 'sebuah', 'di', 'kami',
+        'bisa', 'out', 'lainnya', 'yang', 'yang', 'melakukan', 'mereka',
+        'waktu', 'jika', 'akan', 'bagaimana', 'kata', 'an', 'masing-masing',
+        'memberitahu', 'tidak', 'Kumpulan', 'tiga', 'ingin', 'udara', 'baik',
+        'juga', 'bermain', 'kecil', 'end', 'menempatkan', 'rumah', 'baca',
+        'tangan', 'pelabuhan', 'besar', 'mantra', 'tambahkan', 'bahkan',
+        'tanah', 'di sini', 'harus', 'besar', 'tinggi', 'seperti', 'ikuti',
+        'tindakan', 'mengapa', 'bertanya', 'laki-laki', 'perubahan', 'pergi',
+        'cahaya', 'jenis', 'off', 'perlu', 'rumah', 'gambar', 'coba', 'kami',
+        'lagi', 'hewan', 'titik', 'ibu', 'dunia', 'dekat', 'membangun',
+        'diri', 'bumi', 'ayah'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Filters the locale's stop words out of text; callers include Word
+     * Cloud generation and language detection.
+     *
+     * @param mixed $data one string or an array of strings to clean
+     * @return mixed $data in which every stop word bounded by \b word
+     *      boundaries has been deleted
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $stop_regex = null;
+        if ($stop_regex === null) {
+            $stop_regex = '/\b(' . join('|', self::$stop_words) . ')\b/u';
+        }
+        $without_stop_words = preg_replace($stop_regex, '', $data);
+        return $without_stop_words;
+    }
 }
diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini
index 7e9f9f8a9..193e80e11 100755
--- a/src/locale/it/configure.ini
+++ b/src/locale/it/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "Yioop! Motore di Ricerca in PHP: %s"
 rss_layout_description = "Risultati di ricerca per: %s"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini
index a8d958685..94bb069e3 100755
--- a/src/locale/ja/configure.ini
+++ b/src/locale/ja/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/ja/resources/Tokenizer.php b/src/locale/ja/resources/Tokenizer.php
index ad0faa2e2..04cd56333 100755
--- a/src/locale/ja/resources/Tokenizer.php
+++ b/src/locale/ja/resources/Tokenizer.php
@@ -37,10 +37,45 @@ namespace seekquarry\yioop\locale\ja\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['ように', '私は', '彼の', 'その', '彼',
+    'た', 'ために', '上の', 'アール', 'とともに', '彼ら', 'ある', 'アット',
+    '一つ', '持っている', 'この', 'から', 'バイ', 'ホット', '言葉', 'しかし', '何',
+    'いくつかの', 'です', 'それ', 'あなた', 'または', '持っていた', 'インクルード', 'の',
+    'へ', 'そして', 'は', 'で', '我々', '缶', 'アウト', 'その他', 'だった',
+    'これ', 'やる', 'それらの', '時間', 'もし', '意志', '方法', '前記', 'の',
+    'それぞれ', '言う', 'し', 'セット', '個', '欲しい', '空気', 'よく',
+    'また', '遊ぶ', '小さい', '終わり', '置く', 'ホーム', '読む', '手',
+    'ポート', '大きい', 'スペル', '加える', 'さらに', '土地', 'ここに',
+    'しなければならない', '大きい', '高い', 'そのような', '続く', '行為',
+    'なぜ', '頼む', '人々', '変更', '行ってきました', '光', '種類', 'オフ',
+    '必要', '家', '絵', '試す', '私たち', '再び', '動物', 'ポイント', '母',
+    '世界', '近く', 'ビルド', '自己', '地球', '父'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 3;
+    /**
+     * Erases each occurrence of a stop word from the given text (used for
+     * Word Cloud generation and language detection). The pattern has no
+     * word-boundary anchors, so matches may occur anywhere in the text.
+     *
+     * @param mixed $data a string or an array of strings to process
+     * @return mixed $data with all stop word occurrences deleted
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        // the alternation of stop words is assembled on the first call
+        // only; afterwards the cached regex is reused
+        $pattern = $pattern ?:
+            '/(' . implode('|', self::$stop_words) . ')/u';
+        return preg_replace($pattern, '', $data);
+    }
 }
-
diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini
index 5898ce08d..540e943fb 100755
--- a/src/locale/kn/configure.ini
+++ b/src/locale/kn/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "ಪಿಹೆಚಪಿ ಶೋಧನಾ ಯಂತ್ರ - ಯ
 rss_layout_description = "ಈ ಶೋಧನಾ ಫಲಿತಾಂಶಗಳು ನಿಮ್ಮ ಪ್ರಶ್ನೆ: %s  ಯ ಉತ್ತರ"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/kn/resources/Tokenizer.php b/src/locale/kn/resources/Tokenizer.php
index d60577de5..1be3d1f65 100755
--- a/src/locale/kn/resources/Tokenizer.php
+++ b/src/locale/kn/resources/Tokenizer.php
@@ -37,9 +37,45 @@ namespace seekquarry\yioop\locale\kn\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['ಮಾಹಿತಿ', 'ನಾನು', 'ಅವರ', 'ಆ', 'ಅವರು',
+        'ಆಗಿತ್ತು', 'ಫಾರ್', 'ಮೇಲೆ', 'ಇವೆ', 'ಜೊತೆ', 'ಅವರು', 'ಎಂದು', 'ನಲ್ಲಿ', 'ಒಂದು',
+        'ಹೊಂದಿವೆ', 'ಈ', 'ರಿಂದ', 'ಮೂಲಕ', 'ಬಿಸಿ', 'ಪದ', 'ಆದರೆ', 'ಏನು', 'ಕೆಲವು',
+        'ಆಗಿದೆ', 'ಇದು', 'ನೀವು', 'ಅಥವಾ', 'ಹೊಂದಿತ್ತು', 'ದಿ', 'ನ', 'ಗೆ', 'ಮತ್ತು',
+        'ಒಂದು', 'ರಲ್ಲಿ', 'ನಾವು', 'ಮಾಡಬಹುದು', 'ಔಟ್', 'ಇತರ', 'ಎಂದು', 'ಇದು',
+        'ಹಾಗೆ', 'ತಮ್ಮ', 'ಸಮಯ', 'ವೇಳೆ', 'ತಿನ್ನುವೆ', 'ಹೇಗೆ', 'ಹೇಳಿದರು', 'ಒಂದು',
+        'ಪ್ರತಿ', 'ಹೇಳಲು', 'ಮಾಡುತ್ತದೆ', 'ಸೆಟ್', 'ಮೂರು', 'ಬಯಸುವ', 'ಗಾಳಿ', 'ಹಾಗೂ',
+        'ಸಹ', 'ಆಡಲು', 'ಸಣ್ಣ', 'ಕೊನೆಯಲ್ಲಿ', 'ಪುಟ್', 'ಮನೆ', 'ಓದಲು', 'ಕೈ', 'ಬಂದರು',
+        'ದೊಡ್ಡ', 'ಕಾಗುಣಿತ', 'ಸೇರಿಸಬಹುದು', 'ಸಹ', 'ಭೂಮಿ', 'ಇಲ್ಲಿ', 'ಮಾಡಬೇಕಾಗುತ್ತದೆ',
+        'ದೊಡ್ಡ', 'ಹೆಚ್ಚಿನ', 'ಇಂತಹ', 'ಅನುಸರಿಸಿ', 'ಆಕ್ಟ್', 'ಏಕೆ', 'ಕೇಳಿ', 'ಪುರುಷರು',
+        'ಬದಲಾವಣೆ', 'ಹೋದರು', 'ಬೆಳಕಿನ', 'ರೀತಿಯ', 'ಆಫ್', 'ಅಗತ್ಯವಿದೆ', 'ಮನೆ', 'ಚಿತ್ರ',
+        'ಪ್ರಯತ್ನಿಸಿ', 'ನಮಗೆ', 'ಮತ್ತೆ', 'ಪ್ರಾಣಿ', 'ಪಾಯಿಂಟ್', 'ತಾಯಿ', 'ವಿಶ್ವದ', 'ಬಳಿ',
+        'ನಿರ್ಮಿಸಲು', 'ಸ್ವಯಂ', 'ಭೂಮಿಯ', 'ತಂದೆ'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from $data (used for Word Cloud
+     * generation and language detection). Word edges are tested with
+     * lookarounds because \b misfires beside vowel signs (marks, not \w).
+     *
+     * @param mixed $data string or array of strings to strip stop words from
+     * @return mixed $data with whole-word stop word occurrences removed
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/(?<![\p{L}\p{M}\p{N}_])(' . implode('|',
+                self::$stop_words) . ')(?![\p{L}\p{M}\p{N}_])/u';
+        }
+        return preg_replace($pattern, '', $data);
+    }
 }
diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini
index 028e5e469..e505c6ac7 100755
--- a/src/locale/ko/configure.ini
+++ b/src/locale/ko/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "PHP 검색 엔진 - Yioop! : %s"
 rss_layout_description = "%s 에 대한 서치 결과:"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/ko/resources/Tokenizer.php b/src/locale/ko/resources/Tokenizer.php
index 74a23d38e..776813f51 100755
--- a/src/locale/ko/resources/Tokenizer.php
+++ b/src/locale/ko/resources/Tokenizer.php
@@ -39,9 +39,43 @@ use seekquarry\yioop\models\LocaleModel;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['로', '나는', '그의', '그', '그', '했다',
+        '에 대한', '에', '아르', '와', '그들', '있다', '에', '일', '이', '이',
+        '부터', '에 의해', '뜨거운', '단어', '하지만', '무엇', '다소', '이다', '그',
+        '당신', '또는', '했다', '에', '의', '에', '과', '이', '에', '우리', '수',
+        '아웃', '다른', '했다', '하는', '할', '자신의', '시간', '면', '것', '방법',
+        '말했다', '이', '각', '이야기', '하지', '세트', '세', '필요', '공기', '잘',
+        '또한', '재생', '작은', '끝', '넣어', '홈', '읽기', '손', '포트', '큰',
+        '철자', '추가', '도', '땅', '여기', '해야', '큰', '높은', '이러한', '따라',
+        '행위', '이유', '문의', '남자', '변경', '갔다', '빛', '종류', '오프',
+        '필요가있다', '집', '사진', '시험', '우리', '다시', '동물', '포인트',
+        '어머니', '세계', '가까운', '구축', '자기', '지구', '아버지'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 3;
+    /**
+     * Takes text and deletes every stop word of this locale found in it,
+     * wherever it occurs (the pattern uses no word-boundary anchors). Used
+     * for Word Cloud generation and language detection.
+     *
+     * @param mixed $data string, or array of strings, to remove words from
+     * @return mixed $data with the stop word occurrences taken out
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (!$pattern) {
+            $joined_stop_words = implode('|', self::$stop_words);
+            $pattern = '/(' . $joined_stop_words . ')/u';
+        }
+        return preg_replace($pattern, '', $data);
+    }
 }
diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini
index 285828dd8..52b2650d3 100644
--- a/src/locale/nl/configure.ini
+++ b/src/locale/nl/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = "PHP Search Engine - Yioop! :%S"
 rss_layout_description = "Zoek resultaten voor: %s"
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = "Yioop"
 ;
 ; WikiView.php
diff --git a/src/locale/nl/resources/Tokenizer.php b/src/locale/nl/resources/Tokenizer.php
index 082d85833..f39dfedb4 100755
--- a/src/locale/nl/resources/Tokenizer.php
+++ b/src/locale/nl/resources/Tokenizer.php
@@ -36,6 +36,26 @@ namespace seekquarry\yioop\locale\nl\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection
+     * @var array
+     */
+    public static $stop_words = ['als', 'I', 'zijn', 'dat', 'hij', 'was',
+        'voor', 'op', 'zijn', 'met', 'ze', 'zijn', 'bij', 'een', 'hebben',
+        'deze', 'van', 'door', 'heet', 'woord', 'maar', 'wat', 'sommige',
+        'is', 'het', 'u', 'of', 'had', 'de', 'van', 'aan', 'en', 'een',
+        'in', 'we', 'kan', 'uit', 'andere', 'waren', 'die', 'doen', 'hun',
+        'tijd', 'indien', 'zal', 'hoe', 'zei', 'een', 'elk', 'vertellen',
+        'doet', 'set', 'drie', 'willen', 'lucht', 'goed', 'ook', 'spelen',
+        'klein', 'end', 'zetten', 'thuis', 'lezen', 'de hand', 'poort',
+        'grote', 'spell', 'toevoegen', 'zelfs', 'land', 'hier', 'moet',
+        'grote', 'hoog', 'dergelijke', 'volgen', 'act', 'waarom', 'vragen',
+        'mannen', 'verandering', 'ging', 'licht', 'soort', 'uitgeschakeld',
+        'nodig', 'huis', 'afbeelding', 'proberen', 'ons', 'weer', 'dier',
+        'punt', 'moeder', 'wereld', 'dichtbij', 'bouwen', 'zelf', 'aarde',
+        'vader'];
     /**
      * Words we don't want to be stemmed
      * @var array
@@ -52,6 +72,23 @@ class Tokenizer
         "sme", "spe", "ste", "the", "tje", "uce", "uden", "uien", "uren",
         "use", "uwe", "vse", "ype"
     ];
+    /**
+     * Drops the stop words of this locale from the given text. Word Cloud
+     * generation and language detection both make use of it.
+     *
+     * @param mixed $data a single string or an array of strings to filter
+     * @return mixed $data once all stop words delimited by \b word
+     *      boundaries have been removed
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (!empty($pattern)) {
+            return preg_replace($pattern, '', $data);
+        }
+        $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u';
+        return preg_replace($pattern, '', $data);
+    }
     /**
      * Stub function which could be used for a word segmenter.
      * Such a segmenter on input thisisabunchofwords would output
diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini
index 1288a8cc8..ab35706c7 100755
--- a/src/locale/pl/configure.ini
+++ b/src/locale/pl/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/pl/resources/Tokenizer.php b/src/locale/pl/resources/Tokenizer.php
index e27e344cf..6f4ed36a9 100755
--- a/src/locale/pl/resources/Tokenizer.php
+++ b/src/locale/pl/resources/Tokenizer.php
@@ -37,9 +37,46 @@ namespace seekquarry\yioop\locale\pl\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text.
+     *
+     * NOTE(review): the list repeats some entries (e.g. 'jak', 'w', 'duży')
+     * and holds capitalized entries ('I', 'Strona') while the pattern built
+     * from it carries no case-insensitive flag -- confirm this is intended.
+     * @var array
+     */
+    public static $stop_words = ['jak', 'I', 'jego', 'że', 'on', 'było', 'dla',
+        'na', 'są', 'zespół', 'oni', 'być', 'w', 'jeden', 'mieć', 'tego', 'z',
+        'przez', 'gorący', 'słowo', 'ale', 'co', 'niektóre', 'jest', 'to',
+        'ty', 'lub', 'miał', 'kilka', 'stopa', 'do', 'i', 'ciągnąć', 'w',
+        'my', 'puszka', 'na zewnątrz', 'inne', 'były', 'który', 'zrobić',
+        'ich', 'czas', 'jeśli', 'będzie', 'jak', 'powiedział', 'próba',
+        'każda', 'powiedzieć', 'nie', 'zestaw', 'trzy', 'chcą', 'powietrze',
+        'dobrze', 'również', 'grać', 'mały', 'koniec', 'wkładać',
+        'Strona', 'główna', 'czytaj', 'ręka', 'port', 'duży', 'zaklęcie',
+        'dodać', 'nawet', 'ziemia', 'tutaj', 'musi', 'duży', 'wysoki',
+        'takie', 'śledzić', 'akt', 'dlaczego', 'zapytaj', 'mężczyźni',
+        'zmiana', 'poszedł', 'światła', 'rodzaj', 'z', 'potrzeba', 'dom',
+        'obraz', 'spróbuj', 'nas', 'ponownie', 'zwierząt', 'punkt', 'matka',
+        'świat', 'blisko', 'budować', 'własny', 'ziemia', 'ojciec'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed, blank entries are
+     * dropped, and every entry is escaped with preg_quote before being
+     * joined into one alternation, so an entry cannot alter the regular
+     * expression. Entries are tried longest first so that a multi-word
+     * entry such as 'na zewnątrz' is not shadowed by the shorter entry
+     * 'na'. The pattern is built once and kept in a static variable.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            $pattern = '/\b(' . implode('|', $words) . ')\b/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
 }
diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini
index 6328b4f8d..986c5bfb6 100755
--- a/src/locale/pt/configure.ini
+++ b/src/locale/pt/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/pt/resources/Tokenizer.php b/src/locale/pt/resources/Tokenizer.php
index df73e52bf..10a3cf63f 100755
--- a/src/locale/pt/resources/Tokenizer.php
+++ b/src/locale/pt/resources/Tokenizer.php
@@ -38,6 +38,27 @@ namespace seekquarry\yioop\locale\pt\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text.
+     *
+     * NOTE(review): the list repeats some entries (e.g. 'em', 'uma', 'que')
+     * and contains 'I', 'off' and 'port', which look like untranslated
+     * English -- confirm with a Portuguese speaker.
+     * @var array
+     */
+    public static $stop_words = ['como', 'I', 'seu', 'ele', 'foi', 'para', 'em',
+        'são', 'com', 'eles', 'ser', 'em', 'uma', 'tem', 'este',
+        'partir', 'de', 'por', 'quente', 'palavra', 'mas',
+        'que', 'alguns', 'é', 'ele', 'você', 'ou', 'teve',
+        'o', 'a', 'e', 'uma', 'em', 'nós', 'lata', 'fora',
+        'outro', 'foram', 'que', 'fazer', 'seu', 'tempo', 'se',
+        'vontade', 'como', 'disse', 'uma', 'cada', 'dizer', 'faz',
+        'conjunto', 'três', 'quer', 'ar', 'bem', 'também', 'jogar',
+        'pequeno', 'fim', 'colocar', 'casa', 'ler', 'mão', 'port',
+        'grande', 'soletrar', 'adicionar', 'mesmo', 'terra', 'aqui',
+        'necessário', 'grande', 'alto', 'tais', 'siga', 'ato',
+        'perguntar', 'homens', 'mudança', 'fui', 'luz',
+        'tipo', 'off', 'precisa', 'casa', 'imagem', 'tentar', 'nós',
+        'novamente', 'animais', 'ponto', 'mãe', 'mundo', 'perto',
+        'construir', 'auto', 'terra', 'pai'];
     /**
      * Phrases we would like yioop to rewrite before performing a query
      * @var array
@@ -75,6 +96,23 @@ class Tokenizer
      * @var string
      */
     private static $rv = "";
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed, blank entries are
+     * dropped, and every entry is escaped with preg_quote before being
+     * joined into one alternation, so an entry cannot alter the regular
+     * expression. Entries are tried longest first so that an entry which
+     * begins with another entry is not shadowed by it. The pattern is
+     * built once and kept in a static variable.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            $pattern = '/\b(' . implode('|', $words) . ')\b/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
     /**
      * Stub function which could be used for a word segmenter.
      * Such a segmenter on input thisisabunchofwords would output
diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini
index 976db947b..2c0ac78d6 100755
--- a/src/locale/ru/configure.ini
+++ b/src/locale/ru/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini
index cca42f7bd..6888ac855 100644
--- a/src/locale/te/configure.ini
+++ b/src/locale/te/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = "Yioop"
 ;
 ; WikiView.php
diff --git a/src/locale/te/resources/Tokenizer.php b/src/locale/te/resources/Tokenizer.php
index 73f23b363..d38b26287 100755
--- a/src/locale/te/resources/Tokenizer.php
+++ b/src/locale/te/resources/Tokenizer.php
@@ -39,9 +39,45 @@ use seekquarry\yioop\models\LocaleModel;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text.
+     *
+     * NOTE(review): several entries occur more than once (e.g. 'ఉంది',
+     * 'ఇది', 'ఒక') -- harmless for matching, but confirm the list with a
+     * Telugu speaker.
+     * @var array
+     */
+    public static $stop_words = ['గా', 'నేను', 'తన', 'ఆ', 'అతను', 'ఉంది',
+        'కోసం', 'న', 'ఉన్నాయి', 'తో', 'వారు', 'ఉంటుంది', 'వద్ద', 'ఒకటి', 'కలిగి',
+        'ఈ', 'నుండి', 'ద్వారా', 'వేడి', 'పదం', 'కానీ', 'ఏమి', 'కొన్ని', 'ఉంది',
+        'ఇది', 'మీరు', 'లేదా', 'వచ్చింది', 'ది', 'యొక్క', 'కు', 'మరియు', 'ఒక',
+        'లో', 'మేము', 'చెయ్యవచ్చు', 'అవుట్', 'ఇతర', 'ఉన్నాయి', 'ఇది', 'చేయండి',
+        'వారి', 'సమయం', 'ఉంటే', 'రెడీ', 'ఎలా', 'అన్నాడు', 'ఒక', 'ప్రతి', 'చెప్పండి',
+        'చేస్తుంది', 'సెట్', 'మూడు', 'కావలసిన', 'గాలి', 'బాగా', 'కూడా', 'ప్లే',
+        'చిన్న', 'ముగింపు', 'చాలు', 'హోమ్', 'చదవడానికి', 'చేతి', 'పోర్ట్', 'పెద్ద',
+        'అక్షరక్రమ', 'జోడించండి', 'కూడా', 'భూమి', 'ఇక్కడ', 'తప్పక', 'పెద్ద', 'అధిక',
+        'ఇటువంటి', 'అనుసరించండి', 'చట్టం', 'ఎందుకు', 'గోవా', 'పురుషులు', 'మార్పు',
+        'వెళ్ళింది', 'కాంతి', 'రకం', 'ఆఫ్', 'అవసరం', 'ఇల్లు', 'చిత్రాన్ని', 'ప్రయత్నించండి',
+        'మాకు', 'మళ్ళీ', 'జంతు', 'పాయింట్', 'తల్లి', 'ప్రపంచ', 'సమీపంలో',
+        'నిర్మించడానికి', 'స్వీయ', 'భూమి', 'తండ్రి'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed, blank entries are
+     * dropped, and every entry is escaped with preg_quote before being
+     * joined into one alternation. Entries are tried longest first so that
+     * an entry which begins with another entry is not shadowed by it.
+     *
+     * Word edges are checked with look-arounds over letters, combining
+     * marks, digits and underscore rather than with \b: Telugu vowel signs
+     * are combining marks, which PCRE's \w has not always treated as word
+     * characters, so \b could report a boundary in the middle of a word and
+     * miss the real end of a word that finishes with a vowel sign.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            // characters that count as "inside a word" for this script
+            $in_word = '[\p{L}\p{M}\p{N}_]';
+            $pattern = '/(?<!' . $in_word . ')(' . implode('|', $words) .
+                ')(?!' . $in_word . ')/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
 }
diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini
index 8014cab32..0f5e3dde1 100755
--- a/src/locale/th/configure.ini
+++ b/src/locale/th/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/th/resources/Tokenizer.php b/src/locale/th/resources/Tokenizer.php
index fd9f2d154..d5d0734f8 100755
--- a/src/locale/th/resources/Tokenizer.php
+++ b/src/locale/th/resources/Tokenizer.php
@@ -37,9 +37,44 @@ namespace seekquarry\yioop\locale\th\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text.
+     *
+     * NOTE(review): several entries occur more than once (e.g. 'เป็น',
+     * 'มี', 'บ้าน') -- harmless for matching, but confirm the list with a
+     * Thai speaker.
+     * @var array
+     */
+    public static $stop_words = ['เป็น', 'ฉัน', 'ของเขา', 'ว่า', 'เขา', 'เป็น',
+        'สำหรับ', 'บน', 'มี', 'ด้วย', 'พวกเขา', 'จะเป็น', 'ที่', 'หนึ่ง', 'มี',
+        'นี้', 'จาก', 'โดย', 'ร้อน', 'คำ', 'แต่', 'สิ่งที่', 'บาง', 'เป็น', 'มัน',
+        'คุณ', 'หรือ', 'มี', 'ได้', 'ของ', 'ที่จะ', 'และ', 'กรอก', 'ใน', 'เรา',
+        'สามารถ', 'ออก', 'อื่น ๆ', 'เป็น', 'ซึ่ง', 'ทำ', 'ของพวกเขา', 'เวลา',
+        'ถ้า', 'จะ', 'วิธี', 'กล่าวว่า', 'บิน', 'แต่ละ', 'บอก', 'ไม่', 'ชุด', 'สาม',
+        'ต้องการ', 'อากาศ', 'ดี', 'ด้วย', 'เล่น', 'ขนาดเล็ก', 'ปลาย', 'ใส่', 'บ้าน',
+        'อ่าน', 'มือ', 'พอร์ต', 'ที่มีขนาดใหญ่', 'สะกด', 'เพิ่ม', 'แม้กระทั่ง', 'ที่ดิน',
+        'ที่นี่', 'ต้อง', 'ใหญ่', 'สูง', 'เช่น', 'ทำตาม', 'การกระทำ', 'เหตุผลที่',
+        'ขอให้', 'ผู้ชาย', 'การเปลี่ยนแปลง', 'ไป', 'แสง', 'ชนิด', 'ออก', 'ต้อง',
+        'บ้าน', 'ภาพ', 'พยายาม', 'เรา', 'อีกครั้ง', 'สัตว์', 'จุด', 'แม่', 'โลก',
+        'อยู่ใกล้', 'สร้าง', 'ตนเอง', 'โลก', 'พ่อของ'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed, blank entries are
+     * dropped, and every entry is escaped with preg_quote before being
+     * joined into one alternation. Entries are tried longest first so that
+     * an entry which begins with another entry is not shadowed by it.
+     *
+     * Word edges are checked with look-arounds over letters, combining
+     * marks, digits and underscore rather than with \b: Thai vowel and tone
+     * signs written above or below a consonant are combining marks, which
+     * PCRE's \w has not always treated as word characters, so \b could
+     * report a boundary in the middle of a word and miss the real end of a
+     * word that finishes with such a sign.
+     *
+     * NOTE(review): Thai is usually written without spaces between words,
+     * so whole-word matching may rarely fire on running text -- consider
+     * whether boundary-free matching (as the zh_CN tokenizer does) is what
+     * is wanted here.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            // characters that count as "inside a word" for this script
+            $in_word = '[\p{L}\p{M}\p{N}_]';
+            $pattern = '/(?<!' . $in_word . ')(' . implode('|', $words) .
+                ')(?!' . $in_word . ')/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
 }
diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini
index d2ee070a2..30ca2e314 100755
--- a/src/locale/tr/configure.ini
+++ b/src/locale/tr/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/tr/resources/Tokenizer.php b/src/locale/tr/resources/Tokenizer.php
index 73cbd297e..16f4d7eaf 100755
--- a/src/locale/tr/resources/Tokenizer.php
+++ b/src/locale/tr/resources/Tokenizer.php
@@ -38,9 +38,46 @@ namespace seekquarry\yioop\locale\tr\resources;
  */
 class Tokenizer
 {
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text, so
+     * every entry must be a non-empty word.
+     *
+     * NOTE(review): 'bu' and 'Bu' are both listed and the pattern built
+     * from this list carries no case-insensitive flag -- confirm whether
+     * callers lowercase their input.
+     * @var array
+     */
+    public static $stop_words = ['olarak', 'ben', 'onun', 'bu', 'diye',
+        'oldu', 'için', 'üzerinde', 'vardır', 'ile', 'onlar', 'olmak', 'at',
+        'bir', 'var', 'Bu', 'dan', 'tarafından', 'sıcak', 'kelime', 'ancak',
+        'ne', 'bazı', 'olduğunu', 'o', 'sen', 'veya', 'vardı', 'arasında',
+        'karşı', 've', 'bir', 'içinde', 'biz', 'can', 'üzerinden', 'diğer',
+        'vardı', 'hangi', 'do', 'onların', 'zaman', 'eğer', 'olacak',
+        'nasıl', 'dedi', 'bir', 'her', 'söyle', 'yok', 'set', 'üç',
+        'istiyorum', 'hava', 'iyi', 'ayrıca', 'oynamak', 'küçük', 'son',
+        'koymak', 'ev', 'okumak', 'el', 'liman', 'büyük', 'büyü', 'ekleyin',
+        'hatta', 'arazi', 'burada', 'gerekir', 'büyük', 'yüksek', 'böyle',
+        'izleyin', 'hareket', 'neden', 'sormak', 'erkekler', 'değişim',
+        'gitti', 'ışık', 'tür', 'kapalı', 'gerek', 'ev', 'resim', 'denemek',
+        'bizi', 'tekrar', 'hayvan', 'nokta', 'anne', 'dünya', 'yakın',
+        'inşa', 'etmek', 'öz', 'toprak', 'baba'];
     /**
      * How many characters in a char gram for this locale
      * @var int
      */
     public static $char_gram_len = 5;
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed, blank entries are
+     * dropped (so an empty string in the list cannot add an empty
+     * alternative to the pattern), and every entry is escaped with
+     * preg_quote before being joined into one alternation. Entries are
+     * tried longest first so that an entry which begins with another entry
+     * is not shadowed by it. The pattern is built once and kept in a
+     * static variable.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            $pattern = '/\b(' . implode('|', $words) . ')\b/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
 }
diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini
index a01e2386b..313406dca 100755
--- a/src/locale/vi_VN/configure.ini
+++ b/src/locale/vi_VN/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/vi_VN/resources/Tokenizer.php b/src/locale/vi_VN/resources/Tokenizer.php
index f6ee9cd39..9547380aa 100755
--- a/src/locale/vi_VN/resources/Tokenizer.php
+++ b/src/locale/vi_VN/resources/Tokenizer.php
@@ -39,5 +39,42 @@ namespace seekquarry\yioop\locale\vi_VN\resources;
  */
 class Tokenizer
 {
-
-}
\ No newline at end of file
+    /**
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text, so
+     * entries must not carry leading or trailing whitespace (a padded
+     * entry would only match when another word follows it).
+     *
+     * NOTE(review): many entries are repeated and 'cuố' does not look like
+     * a complete word -- confirm the list with a Vietnamese speaker.
+     * @var array
+     */
+    public static $stop_words = ['như', 'tôi', 'mình', 'mà', 'ông', 'là',
+        'cho', 'trên', 'là', 'với', 'họ', 'được', 'tại', 'một', 'có',
+        'này', 'từ', 'bởi', 'nóng', 'từ', 'nhưng', 'những', 'gì', 'một',
+        'số', 'là', 'nó', 'anh', 'hoặc', 'có', 'các', 'của', 'để', 'và',
+        'một', 'trong', 'chúng', 'tôi', 'có', 'thể', 'ra', 'khác', 'là',
+        'mà', 'làm', 'của', 'họ', 'thời', 'gian', 'nếu', 'sẽ', 'như', 'thế',
+        'nào', 'nói', 'một', 'môi', 'nói', 'không', 'bộ', 'ba', 'muốn',
+        'không', 'khí', 'cũng', 'cũng', 'chơi', 'nhỏ', 'cuố', 'đặt', 'nhà',
+        'đọc', 'tay', 'cổng', 'lớn', 'chính', 'tả', 'thêm', 'thậm', 'chí',
+        'đất', 'ở', 'đây', 'phải', 'lớn', 'cao', 'như', 'vậy', 'theo',
+        'hành', 'động', 'lý', 'do', 'tại', 'sao', 'xin', 'người', 'đàn',
+        'ông', 'thay', 'đổi', 'đi', 'ánh', 'sáng', 'loại', 'tắt', 'cần', 'nhà',
+        'hình', 'ảnh', 'thử', 'chúng', 'tôi', 'một', 'lần', 'nữa', 'động',
+        'vật', 'điểm', 'mẹ', 'thế', 'giới', 'gần', 'xây', 'dựng', 'tự', 'đất',
+        'cha'];
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * The entries of self::$stop_words are trimmed (so an entry written
+     * with a trailing space still matches at the end of the text or before
+     * punctuation), blank entries are dropped, and every entry is escaped
+     * with preg_quote before being joined into one alternation. Entries
+     * are tried longest first so that an entry which begins with another
+     * entry is not shadowed by it. The pattern is built once and kept in a
+     * static variable.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each whole-word occurrence of a stop word
+     *      replaced by the empty string; if preg_replace signals an error
+     *      by returning null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
+    {
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            $pattern = '/\b(' . implode('|', $words) . ')\b/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
+    }
+}
diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini
index f1355ae2d..7324865eb 100755
--- a/src/locale/zh_CN/configure.ini
+++ b/src/locale/zh_CN/configure.ini
@@ -1691,7 +1691,7 @@ rss_layout_title = ""
 rss_layout_description = ""
 ;
 ; View.php
-view_locale_version = "3"
+view_locale_version = "4"
 view_logo_alt_text = ""
 ;
 ; WikiView.php
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 8202c25a0..c50dc568e 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -41,13 +41,37 @@ use seekquarry\yioop\library\PhraseParser;
 class Tokenizer
 {
     /**
-     * Removes the stop words from the page
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * A list of frequently occurring terms for this locale which should
+     * be excluded from certain kinds of queries. This is also used
+     * for language detection. stopwordsRemover() joins these entries into
+     * the regular expression it uses to strip stop words from text.
+     *
+     * NOTE(review): '別', '告訴' and '不會' appear to be traditional-form
+     * entries in this zh_CN list -- confirm they are intended.
+     * @var array
      */
-    public static function stopwordsRemover($page)
+    public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去',
+        '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你',
+        '的', '是', '了', '他', '么', '们', '在', '说', '为', '好', '吧', '知道',
+        '我的', '和', '你的', '想', '只', '很', '都', '对', '把', '啊', '怎', '得',
+        '还', '过', '不是', '到', '样', '飞', '远', '身', '任何', '生活', '够',
+        '号', '兰', '瑞', '达', '或', '愿', '蒂', '別', '军', '正', '是不是',
+        '证', '不用', '三', '乐', '吉', '男人', '告訴', '路', '搞', '可是',
+        '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
+        '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
+        '女孩', '世界'];
+    /**
+     * Removes this locale's stop words from the supplied text (used for
+     * Word Cloud generation and language detection).
+     *
+     * Matching uses no word boundaries: every occurrence of a stop word is
+     * removed wherever it appears. The entries of self::$stop_words are
+     * trimmed, blank entries are dropped, and every entry is escaped with
+     * preg_quote before being joined into one alternation. Entries are
+     * tried longest first; in list order a single-character entry such as
+     * '不' would match before '不用' or '不會' and leave the second
+     * character behind. The pattern is built once and kept in a static
+     * variable.
+     *
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with each occurrence of a stop word replaced by
+     *      the empty string; if preg_replace signals an error by returning
+     *      null, $data is returned unchanged
+     */
+    public static function stopwordsRemover($data)
     {
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $words = [];
+            foreach (self::$stop_words as $stop_word) {
+                $stop_word = trim($stop_word);
+                if ($stop_word !== '') {
+                    $words[] = preg_quote($stop_word, '/');
+                }
+            }
+            // longest first: an entry that begins with another entry must
+            // be tried before it or it could never match
+            usort($words, function ($a, $b) {
+                return strlen($b) - strlen($a);
+            });
+            $pattern = '/(' . implode('|', $words) . ')/u';
+        }
+        $cleaned = preg_replace($pattern, '', $data);
+        // preg_replace returns null on a PCRE error (for instance a subject
+        // that is not valid UTF-8 under /u); do not lose the caller's text
+        return ($cleaned === null) ? $data : $cleaned;
     }
     /**
      * A word segmenter.

ViewGit