Last commit for locale/ar/resources/Tokenizer.php: 3767244d2bba0018926a97e51a67b136472f5f50

Changes for Version 3.0.1, a=chris

Chris Pollett [2015-08-27 19:Aug:th]

Changes for Version 3.0.1, a=chris

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
namespace seekquarry\yioop\locale\ar\resources;

/**
 * Arabic specific tokenization code. In particular, it has a stemmer,
 * The stemmer is my stab at porting Ljiljana Dolamic (University of Neuchatel,
 * www.unine.ch/info/clef/) C stemming algorithm:
 * http://members.unine.ch/jacques.savoy/clef
 * That algorithm maps all stems to ASCII. Instead, I tried to leave everything
 * using Arabic characters.
 *
 * @author Chris Pollett
 * @package seek_quarry\locale\ar
 */
class Tokenizer
{
    /**
     * Words we don't want to be stemmed
     * @var array
     */
    public static $no_stem_list = [];
    /**
     * Stub function which could be used for a word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment  before segmentation
     * @return string should return string with words separated by space
     *     in this case does nothing
     */
    public static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Removes the stop words from the page (used for Word Cloud generation)
     *
     * @param string $page the page to remove stop words from.
     * @return string $page with no stop words
     */
    public static function stopwordsRemover($page)
    {
        $stop_words = [
            "ا", "أ", "،", "عشر", "عدد", "عدة","عشرة",
            "عدم", "عام", "عاما", "عن", "عند", "عندما",
            "على", "عليه", "عليها", "زيارة", "سنة", "سنوات",
            "تم", "ضد", "بعد", "بعض", "اعادة", "اعلنت",
            "بسبب", "حتى", "اذا", "احد", "اثر", "برس",
            "باسم", "غدا", "شخصا", "صباح", "اطار",
            "اربعة", "اخرى", "بان", "اجل", "غير",
            "بشكل", "حاليا", "بن", "به", "ثم", "اف",
            "ان", "او", "اي", "بها", "صفر", "حيث",
            "اكد", "الا", "اما", "امس", "السابق",
            "التى", "التي", "اكثر", "ايار", "ايضا",
            "ثلاثة", "الذاتي", "الاخيرة", "الثاني",
            "الثانية", "الذى", "الذي", "الان", "امام",
            "ايام", "خلال", "حوالى", "الذين", "الاول",
            "الاولى", "بين", "ذلك", "دون", "حول", "حين",
            "الف", "الى", "انه", "اول", "ضمن", "انها",
            "جميع", "الماضي", "الوقت", "المقبل", "اليوم",
            "ـ", "ف", "و", "و6", "قد", "لا", "ما", "مع",
            "مساء", "هذا", "واحد", "واضاف", "واضافت",
            "فان", "قبل", "قال", "كان", "لدى", "نحو",
            "هذه", "وان", "واكد", "كانت", "واوضح",
            "مايو", "فى", "في", "كل", "لم", "لن", "له",
            "من", "هو", "هي", "قوة", "كما", "لها", "منذ",
            "وقد", "ولا", "نفسه", "لقاء", "مقابل", "هناك",
            "وقال", "وكان", "نهاية", "وقالت", "وكانت",
            "للامم", "فيه", "كلم", "لكن", "وفي", "وقف",
            "ولم", "ومن", "وهو", "وهي", "يوم", "فيها",
            "منها", "مليار", "لوكالة", "يكون", "يمكن",
            "مليون"
        ];
        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
            $page);
        return $page;
    }
    /**
     * Computes the stem of an Arabic word
     *
     * @param string $word the string to stem
     * @return string the stem of $word
     */
    public static function stem($word)
    {
        if (in_array($word, self::$no_stem_list)) {
            return $word;
        }
        $word = mb_strtolower($word);
        $word = self::removeModifiersAndArchaic($word);
        $word = self::removeSuffix($word);
        $word = self::removePrefix($word);
        return $word;
    }
    /**
     * Removes common letter modifiers as well as some archaic characters
     * @param string $word
     * @return string the $word after letter modifiers removed
     */
    private static function removeModifiersAndArchaic($word)
    {
        $modifiers = ["\u0621", "\u0640", "\u064B", "\u064C", "\u064D",
            "\u064E", "\u064F", "\u0650", "\u0651", "\u0652", "\u0653",
            "\u0654", "\u0655", "\u0656", "\u0674", "\u0677", "\u0679"];
        foreach($modifiers as $modifier) {
            $m = json_decode('"'.$modifier.'"');
            $word = preg_replace("/".$m."/u", "", $word);
        }
        return $word;
    }
    /**
     * Removes Arabic suffixes to get root
     *
     * @param string $word word to remove suffixes from
     * @return string the $word after suffix removal
     */
    private static function removeSuffix($word)
    {
        $length = mb_strlen($word);
        if ($length > 5) {
            $last_two = mb_substr($word, -2);
            if(in_array($last_two,["ون", "آه", "أه", "آت", "أت", "آن",
                "أن", "ىة", "ىه", "ىن", "ةئ", "ئن", "ئه"])) {
                $word = mb_substr($word, 0, -2);
                return $word;
            }
        }
        if ($length > 4) {
            $last_one = mb_substr($word, -1);
            if(in_array($last_one, ["ى", "ۃ", "ه", "ئ"])) {
                $word = mb_substr($word, 0, -1);
                return $word;
            }
        }
        return $word;
    }
    /**
     * Removes Arabic prefixes to get root
     * @param string $word word to remove prefixes from
     *
     * @return string the $word after prefix removal
     */
    private static function removePrefix($word)
    {
        $length = mb_strlen($word);
        if ($length > 6) {
            $start_three = mb_substr($word, 0, 3);
            if(in_array($start_three, ["ڡآل", "ۀآل", "بآل", "وآل",
                "ڡأل", "ۀأل", "بأل", "وأل"])) {
                $word = mb_substr($word, 3);
                return $word;
            }
        }
        if ($length > 5) {
            $start_two = mb_substr($word, 0, 2);
            if(in_array($start_two, ["آل", "أل"])) {
                $word = mb_substr($word, 2);
                return $word;
            }
        }
        if ($length > 4) {
            $start_one = mb_substr($word, 0, 1);
            if(in_array($start_one, ["و", "ى", "ٸ"])) {
                $word = mb_substr($word, 1);
                return $word;
            }
        }
        return $word;
    }
}

ViewGit