Filename | |
---|---|
src/locale/fa/resources/Tokenizer.php | |
tests/FaTokenizerTest.php | |
tests/test_files/persian_stemmer/stemmed_result.txt |
diff --git a/src/locale/fa/resources/Tokenizer.php b/src/locale/fa/resources/Tokenizer.php index e43e8cbf4..deb7da78e 100755 --- a/src/locale/fa/resources/Tokenizer.php +++ b/src/locale/fa/resources/Tokenizer.php @@ -30,7 +30,8 @@ namespace seekquarry\yioop\locale\fa\resources; /** * Persian specific tokenization code. In particular, it has a stemmer, - * The stemmer is my stab at porting Nick Patch's Perl port, + * The stemmer is a modified variant (handling prefixes slightly differently) + * of my stab at porting Nick Patch's Perl port, * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the * stemming algorithm by Ljiljana Dolamic and Jacques * Savoy of the University of Neuchâtel. The Java version of this is at @@ -138,11 +139,25 @@ class Tokenizer return $word; } $word = mb_strtolower($word); + $word = self::simplifyPrefix($word); $word = self::removeKasra($word); $word = self::removeSuffix($word); $word = self::removeKasra($word); return $word; } + /** + * Replaces a leading آ with ا in words of at least five characters + * @param string $word word whose leading آ should be simplified + * @return string word with a leading آ replaced by ا; words shorter than five characters are returned unchanged + */ + private static function simplifyPrefix($word) + { + if(mb_strlen($word) < 5) { + return $word; + } + $word = preg_replace('/^آ/u', "ا", $word); + return $word; + } /** * Removes a Kasra diacritic mark if appears * at the end of a word. 
diff --git a/tests/FaTokenizerTest.php b/tests/FaTokenizerTest.php index 923c66341..c7a855cd4 100644 --- a/tests/FaTokenizerTest.php +++ b/tests/FaTokenizerTest.php @@ -94,7 +94,6 @@ class FaTokenizerTest extends UnitTest $word_stem = $tokenizer->stem($word); if ($stem != $word_stem) { echo "Stemming $word to $word_stem should be $stem\n"; - exit(); } $this->assertEqual($word_stem, $stem, "function stem correctly stems diff --git a/tests/test_files/persian_stemmer/stemmed_result.txt b/tests/test_files/persian_stemmer/stemmed_result.txt index cf9702d5e..a8df5f964 100644 --- a/tests/test_files/persian_stemmer/stemmed_result.txt +++ b/tests/test_files/persian_stemmer/stemmed_result.txt @@ -17,7 +17,7 @@ عمل كرد خود -آيند +ايند مزايا استفاد دراز @@ -81,13 +81,13 @@ ساد تركيب بعد -آميز +اميز استحك نقو پرد نظم درون -آناتوم +اناتوم اندا ديگر مشخص @@ -101,7 +101,7 @@ سرهاي بزرگ اندا -آميز +اميز گرفت نقا تغيير