Last commit for src/locale/fa/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
namespace seekquarry\yioop\locale\fa\resources;

/**
 * Persian-specific tokenization code. In particular, it has a stemmer.
 * The stemmer is my stab at porting Nick Patch's Perl port,
 * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the
 * stemming algorithm by Ljiljana Dolamic and Jacques
 * Savoy of the University of Neuchâtel. The Java version of this is at
 * http://members.unine.ch/jacques.savoy/clef/persianStemmerUnicode.txt
 * (beware of Java's handling of Unicode).
 * Here given a word, its stem is that part of the word that
 * is common to all its inflected variants. For example,
 * tall is common to tall, taller, tallest. A stemmer takes
 * a word and tries to produce its stem.
 *
 * @author Chris Pollett
 * @package seek_quarry\locale\fa
 */
class Tokenizer
{
    /**
     * Words we don't want to be stemmed. stem() returns any word found in
     * this list unchanged (the lookup happens before lower-casing).
     * @var array
     */
    public static $no_stem_list = [];
    /**
     * Stub function which could be used for a word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment  before segmentation
     * @return string should return string with words separated by space
     *     in this case does nothing
     */
    public static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Removes the stop words from the page (used for Word Cloud generation).
     * The page is lower-cased first and each whole-word occurrence of a
     * stop word is replaced by the empty string; the whitespace around a
     * removed word is left in place.
     *
     * @param string $page the page to remove stop words from.
     * @return string lower-cased $page with no stop words. If the regular
     *     expression engine reports an error the lower-cased page is
     *     returned unfiltered rather than null.
     */
    public static function stopwordsRemover($page)
    {
        $stop_words = [
            "در", "به", "از", "كه", "مي", "اين", "است", "را", "با", "هاي",
            "براي", "آن", "يك", "شود", "شده","خود", "ها", "كرد", "شد", "اي",
            "تا", "كند", "بر", "بود", "گفت", "نيز", "وي", "هم", "كنند",
            "دارد", "ما", "كرده", "يا", "اما", "بايد", "دو", "اند", "هر",
            "خواهد", "او", "مورد", "آنها", "باشد", "ديگر", "مردم", "نمي",
            "بين", "پيش", "پس", "اگر", "همه", "صورت", "يكي", "هستند",
            "بي", "من", "دهد", "هزار", "نيست", "استفاده", "داد", "داشته",
            "راه", "داشت", "چه", "همچنين", "كردند", "داده", "بوده",
            "دارند", "همين", "ميليون", "سوي", "شوند", "بيشتر", "بسيار",
            "روي", "گرفته", "هايي", "تواند", "اول", "نام", "هيچ", "چند",
            "جديد", "بيش", "شدن", "كردن", "كنيم", "نشان", "حتي", "اينكه",
            "ولی", "توسط", "چنين", "برخي", "نه", "ديروز", "دوم",
            "درباره", "بعد", "مختلف", "گيرد", "شما", "گفته", "آنان",
            "بار", "طور", "گرفت", "دهند", "گذاري", "بسياري", "طي",
            "بودند", "ميليارد", "بدون", "تمام", "كل", "تر",
            "براساس", "شدند", "ترين", "امروز", "باشند", "ندارد",
            "چون", "قابل", "گويد", "ديگري", "همان", "خواهند",
            "قبل", "آمده", "اكنون", "تحت", "طريق", "گيري", "جاي",
            "هنوز", "چرا", "البته", "كنيد", "سازي", "سوم", "كنم",
            "بلكه", "زير", "توانند", "ضمن", "فقط", "بودن", "حق",
            "آيد", "وقتي", "اش", "يابد", "نخستين", "مقابل", "خدمات",
            "امسال", "تاكنون", "مانند", "تازه", "آورد", "فكر",
            "آنچه", "نخست", "نشده", "شايد", "چهار", "جريان",
            "پنج", "ساخته", "زيرا", "نزديك", "برداري", "كسي",
            "ريزي", "رفت", "گردد", "مثل", "آمد", "ام", "بهترين",
            "دانست", "كمتر", "دادن", "تمامي", "جلوگيري",
            "بيشتري", "ايم", "ناشي", "چيزي", "آنكه", "بالا",
            "بنابراين", "ايشان", "بعضي", "دادند", "داشتند",
            "برخوردار", "نخواهد", "هنگام", "نبايد", "غير", "نبود",
            "ديده", "وگو", "داريم", "چگونه", "بندي", "خواست", "فوق", "ده",
            "نوعي", "هستيم", "ديگران", "همچنان", "سراسر", "ندارند",
            "گروهي", "سعي", "روزهاي", "آنجا", "يكديگر", "كردم",
            "بيست", "بروز", "سپس", "رفته", "آورده", "نمايد",
            "باشيم", "گويند", "زياد", "خويش", "همواره", "گذاشته",
            "شش", "نداشته", "شناسي", "خواهيم", "آباد", "داشتن",
            "نظير", "همچون", "باره", "نكرده", "شان", "سابق",
            "هفت", "دانند", "جايي", "بی", "جز", "زیرِ", "رویِ",
            "سریِ", "تویِ", "جلویِ", "پیشِ", "عقبِ", "بالایِ",
            "خارجِ", "وسطِ", "بیرونِ", "سویِ", "کنارِ", "پاعینِ",
            "نزدِ", "نزدیکِ","دنبالِ", "حدودِ", "برابرِ", "طبقِ",
            "مانندِ", "ضدِّ", "هنگامِ", "برایِ", "مثلِ", "بارة",
            "اثرِ", "تولِ", "علّتِ", "سمتِ", "عنوانِ", "قصدِ",
            "روب", "جدا", "کی", "که", "چیست", "هست", "کجا", "کجاست",
            "کَی", "چطور", "کدام", "آیا", "مگر", "چندین",
            "یک", "چیزی", "دیگر", "کسی", "بعری", "هیچ", "چیز",
            "جا", "کس", "هرگز", "یا", "تنها", "بلکه", "خیاه",
            "بله", "بلی", "آره", "آری", "مرسی", "البتّه",
            "لطفاً", "ّه", "انکه",
            "وقتیکه", "همین", "پیش", "مدّتی", "هنگامی", "مان", "تان"
            ];
        $lowered_page = mb_strtolower($page);
        $pattern = '/\b('.implode('|',$stop_words).')\b/u';
        $filtered_page = preg_replace($pattern, '', $lowered_page);
        // preg_replace yields null on a PCRE error; keep the documented
        // string return type by falling back to the unfiltered text
        if ($filtered_page === null) {
            return $lowered_page;
        }
        return $filtered_page;
    }
    /**
     * Computes the stem of a Persian word. Words listed in
     * self::$no_stem_list are returned untouched; any other word is
     * lower-cased, has a trailing kasra removed, has at most one suffix
     * stripped, and finally has a trailing kasra removed again.
     *
     * @param string $word the string to stem
     * @return string the stem of $word
     */
    public static function stem($word)
    {
        // strict comparison so numeric-looking strings are not conflated
        if (in_array($word, self::$no_stem_list, true)) {
            return $word;
        }
        $word = mb_strtolower($word);
        $word = self::removeKasra($word);
        $word = self::removeSuffix($word);
        $word = self::removeKasra($word);
        return $word;
    }
    /**
     * Removes a Kasra diacritic mark if appears
     * at the end of a word. Words shorter than five characters are
     * left alone.
     * @param string $word word to remove mark from
     * @return string result of removal
     */
    private static function removeKasra($word)
    {
        if (mb_strlen($word) < 5) {
            return $word;
        }
        // UTF-8 encoding of U+0650 ARABIC KASRA
        $kasra = "\xD9\x90";
        $word = preg_replace('/'.$kasra.'$/u', "", $word);
        return $word;
    }
    /**
     * Removes common Persian suffixes. Suffix groups are tried from the
     * longest to the shortest and the first group that matches wins, so
     * at most one suffix is removed per call. Each group is only tried
     * when the word is long enough; the length used for these checks is
     * that of the word as passed in.
     *
     * @param string $word to remove suffixes from
     * @return string result of suffix removal
     */
    private static function removeSuffix($word)
    {
        $length = mb_strlen($word);
        if ($length > 7) {
            $modified_word = preg_replace("/(?:
                آباد | باره | بندی | بندي | ترین | ترين | ریزی |
                ريزي | سازی | سازي | گیری | گيري | هایی | هايي
                ) $/xu", "", $word);
            if ($modified_word !== $word) {
                return $modified_word;
            }
        }
        if ($length > 6) {
            $modified_word = preg_replace("/(?:
                    اند | ایم | ايم | شان | های | هاي
                ) $/xu", "", $word);
            if ($modified_word !== $word) {
                return $modified_word;
            }
        }
        if ($length > 5) {
            $modified_word = preg_replace("/ ان $/xu", "", $word);
            if ($modified_word !== $word) {
                // normalize the word *after* the suffix has been stripped;
                // passing the unstripped word would leave it unchanged
                return self::normalize($modified_word);
            }
            $modified_word = preg_replace("/(?:
                    ات | اش | ام | تر | را | ون | ها | هء | ین | ين
                ) $/xu", "", $word);
            if ($modified_word !== $word) {
                return $modified_word;
            }
        }
        if ($length > 3) {
            $modified_word = preg_replace("/(?: ت | ش | م | ه | ی | ي ) $/xu",
                "", $word);
            if ($modified_word !== $word) {
                return $modified_word;
            }
        }
        return $word;
    }
    /**
     * Performs additional end word stripping. Called by removeSuffix()
     * on what is left of a word once its trailing "ان" has been removed.
     * A single final letter from a small set is dropped and, if the
     * result still has at least four characters, a trailing yeh is
     * dropped as well.
     *
     * @param string $word to remove suffixes from
     * @return string result of suffix removal
     */
    private static function normalize($word)
    {
        if (mb_strlen($word) < 4) {
            return $word;
        }
        $modified_word = preg_replace("/(?: ت | ر | ش | گ | م | ى ) $/xu", "",
            $word);
        if ($modified_word !== $word) {
            $word = $modified_word;
            if (mb_strlen($word) < 4) {
                return $word;
            }
            $word = preg_replace("/(?: ی | ي ) $/xu", "", $word);
        }
        return $word;
    }
}

ViewGit