Tweak at modifying prefix handling in Persian stemmer, a=chris

Chris Pollett [2015-08-23]

Tweak at modifying prefix handling in Persian stemmer, a=chris

Filename
src/locale/fa/resources/Tokenizer.php
tests/FaTokenizerTest.php
tests/test_files/persian_stemmer/stemmed_result.txt

diff --git a/src/locale/fa/resources/Tokenizer.php b/src/locale/fa/resources/Tokenizer.php
index e43e8cbf4..deb7da78e 100755
--- a/src/locale/fa/resources/Tokenizer.php
+++ b/src/locale/fa/resources/Tokenizer.php
@@ -30,7 +30,8 @@ namespace seekquarry\yioop\locale\fa\resources;

 /**
  * Persian specific tokenization code. In particular, it has a stemmer,
- * The stemmer is my stab at porting Nick Patch's Perl port,
+ * The stemmer is a modified variant (handling prefixes slightly differently)
+ * of my stab at porting Nick Patch's Perl port,
  * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the
  * stemming algorithm by Ljiljana Dolamic and Jacques
  * Savoy of the University of Neuchâtel. The Java version of this is at
@@ -138,11 +139,25 @@ class Tokenizer
             return $word;
         }
         $word = mb_strtolower($word);
+        $word = self::simplifyPrefix($word);
         $word = self::removeKasra($word);
         $word = self::removeSuffix($word);
         $word = self::removeKasra($word);
         return $word;
     }
+    /**
+     * Replaces a leading آ with ا; words with mb_strlen under 5 are unchanged
+     * @param string $word word whose leading آ is to be simplified
+     * @return string $word with a leading آ, if any, replaced by ا
+     */
+    private static function simplifyPrefix($word)
+    {
+        if(mb_strlen($word) < 5) {
+            return $word;
+        }
+        $word = preg_replace('/^آ/u', "ا", $word);
+        return $word;
+    }
     /**
      * Removes a Kasra diacritic mark if appears
      * at the end of a word.
diff --git a/tests/FaTokenizerTest.php b/tests/FaTokenizerTest.php
index 923c66341..c7a855cd4 100644
--- a/tests/FaTokenizerTest.php
+++ b/tests/FaTokenizerTest.php
@@ -94,7 +94,6 @@ class FaTokenizerTest extends UnitTest
             $word_stem = $tokenizer->stem($word);
             if ($stem != $word_stem) {
                 echo "Stemming $word to $word_stem should be $stem\n";
-                exit();
             }
             $this->assertEqual($word_stem,
                     $stem, "function stem correctly stems
diff --git a/tests/test_files/persian_stemmer/stemmed_result.txt b/tests/test_files/persian_stemmer/stemmed_result.txt
index cf9702d5e..a8df5f964 100644
--- a/tests/test_files/persian_stemmer/stemmed_result.txt
+++ b/tests/test_files/persian_stemmer/stemmed_result.txt
@@ -17,7 +17,7 @@
 عمل
 كرد
 خود
-آيند
+ايند
 مزايا
 استفاد
 دراز
@@ -81,13 +81,13 @@
 ساد
 تركيب
 بعد
-آميز
+اميز
 استحك
 نقو
 پرد
 نظم
 درون
-آناتوم
+اناتوم
 اندا
 ديگر
 مشخص
@@ -101,7 +101,7 @@
 سرهاي
 بزرگ
 اندا
-آميز
+اميز
 گرفت
 نقا
 تغيير

ViewGit