Last commit for src/locale/el_GR/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\locale\el_GR\resources;

/**
 * Greek specific tokenization code. Contains a list of greek stop words
 * used in making word clouds. It also has a greek stemmer.
 * This stemmer is based on the algorithms described in
 * Ntais, Georgios. Development of a Stemmer for the Greek Language.
 * Diss. Royal Institute of Technology, 2006.
 * and
 * Saroukos, Spyridon. Enhancing a Greek language stemmer.
 * University of Tampere, 2008.
 * From here I looked at the implementation given at:
 * https://snowballstem.org/algorithms/greek/stemmer.html
 * In particular, I looked at the Snowball code, the Javascript Demo code, and
 * the PHP code (GPLv3) in:
 * https://git.drupalcode.org/project/greekstemmer
 * Copyright (c) 2009 Vassilis Spiliopoulos (http://www.psychfamily.gr)
 * Updated by Yannis Karampelas (info@netstudio.gr) in 2011 and 2017
 * respectively based on earlier work Spyros Saroukos into Drupal CMS.
 *
 * The code below is largely a complete rewrite to make this work in UTF-8
 * lower case Greek rather than use upper case iso-8859-7 as the file encoding.
 * Most of the repetitive code has been refactored into a method regexStem
 * which is repeatedly called with different regex expressions.
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * Words we don't want to be stemmed
     * @var array
     */
    public static $no_stem_list = [];
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. For Greek,
     * took the top 250 words from
     * https://en.wiktionary.org/
     *  wiki/Wiktionary:Frequency_lists/Greek_wordlist#1-250
     * @var array
     */
    public static $stop_words = [
        'http', 'https', "να", "το", "δεν", "είναι", "θα", "και", "μου",
        "με", "για", "την", "σου", "τον", "τα", "που", "σε", "τι", "του",
        "αυτό", "ότι", "στο", "από", "της", "τη", "όχι", "ναι", "αν", "ένα",
        "τους", "εδώ", "μια", "αλλά", "μας", "είσαι", "σας", "ήταν", "πρέπει",
        "είμαι", "κι", "οι", "στην", "πολύ", "γιατί", "δε", "εγώ", "πως",
        "τώρα", "εντάξει", "ξέρω", "κάτι", "τις", "έχει", "έχω", "εσύ", "μην",
        "θέλω", "καλά", "έτσι", "στη", "στον", "αυτή", "ξέρεις", "κάνεις",
        "έχεις", "όταν", "μπορώ", "μόνο", "εκεί", "σαν", "μαζί", "πώς",
        "τίποτα", "κάνω", "όλα", "ευχαριστώ", "μπορεί", "κάνει", "ποτέ",
        "απ", "τόσο", "στα", "αυτά", "πού", "πάμε", "μέσα", "των", "μπορείς",
        "πιο", "υπάρχει", "ακόμα", "απλά", "έλα", "έχουμε", "αυτός", "σπίτι",
        "λοιπόν", "είμαστε", "τότε", "πίσω", "παρακαλώ", "μετά", "πριν", "ίσως",
        "λίγο", "νομίζω", "κύριε", "γεια", "ένας", "πάντα", "πω", "ποιος",
        "δουλειά", "μη", "δω", "λες", "αλήθεια", "όπως", "παιδιά", "όλοι",
        "είπε", "γι", "θέλεις", "άλλο", "δύο", "ας", "ζωή", "είχε", "έναν",
        "κάνουμε", "πάω", "οχι", "ωραία", "καλό", "είπα", "θες", "πες", "στις",
        "κοίτα", "πάνω", "έξω", "σένα", "χρόνια", "ώρα", "έχουν", "ούτε",
        "μία", "μα", "κάτω", "μένα", "φορά", "μέρα", "ήμουν", "κάποιος",
        "έπρεπε", "κάθε", "μέχρι", "κανείς", "καλή", "όμως", "επειδή",
        "γυναίκα", "πράγματα", "είστε", "είχα", "χωρίς", "ήθελα", "σωστά",
        "θέλει", "μαμά", "μπορούμε", "μόλις", "δυο", "πάει", "λέει", "θεέ",
        "πας", "καλύτερα", "ειναι", "σήμερα", "έγινε", "έκανε", "ακριβώς",
        "πόσο", "συγγνώμη", "πεις", "αρέσει", "έκανα", "συμβαίνει", "λυπάμαι",
        "πολλά", "φαίνεται", "www", "πρόβλημα", "εμένα", "είπες", "κάποιον",
        "στιγμή", "αυτόν", "λάθος", "μέρος", "γίνει", "όσο", "λένε", "λεφτά",
        "περίμενε", "χρόνο", "παιδί", "άλλη", "βλέπω", "πράγμα", "απο",
        "εσένα", "έκανες", "φυσικά", "δικό", "ήσουν", "γρήγορα", "πάλι",
        "στους", "πιστεύω", "κάποια", "ως", "φίλε", "οπότε", "μάλλον", "πάρω",
        "μπαμπά", "γίνεται", "λέω", "έχετε", "υπάρχουν", "ξέρει", "ιδέα",
        "χρειάζεται", "όλο", "ίδιο", "πήγαινε", "νομίζεις", "σίγουρα", "οτι",
        "συγνώμη", "πάρει", "μωρό", "εσείς", "νέα", "όλη", "μητέρα", "σημαίνει",
        "φορές", "εμείς", "είδα"
    ];
    /**
     * Associative array of suffixes to replace with simplified suffixes.
     * Every suffix that appears in the Step 1 regex of @see stem needs an
     * entry here, otherwise @see regexStem drops the suffix entirely
     * instead of replacing it.
     * @var array
     */
    public static $suffix_patterns = [
        "φαγια" => "φα", "φαγιου" => "φα", "φαγιων" => "φα",
        "σκαγια" => "σκα", "σκαγιου" => "σκα", "σκαγιων" => "σκα",
        "ολογιου" => "ολο", "ολογια" => "ολο", "ολογιων" => "ολο",
        "σογιου" => "σο", "σογια" => "σο", "σογιων" => "σο",
        "τατογια" => "τατο", "τατογιου" => "τατο", "τατογιων" => "τατο",
        "κρεας" => "κρε", "κρεατος" => "κρε",
        "κρεατα" => "κρε", "κρεατων" => "κρε",
        "περας" => "περ", "περατος" => "περ",
        "περατη" => "περ", //added by spyros . also in step1 regex
        "περατα"=> "περ", "περατων" => "περ",
        "τερας" => "τερ", "τερατος" => "τερ", "τερατα" => "τερ",
        "τερατων" => "τερ",
        "φως" => "φω", "φωτος" => "φω", "φωτα" => "φω", "φωτων" => "φω",
        "καθεστως" => "καθεστ", "καθεστωτος" => "καθεστ",
        "καθεστωτα" => "καθεστ", "καθεστωτων" => "καθεστ",
        "γεγονος" => "γεγον", "γεγονοτος" => "γεγον",
        "γεγονοτα" => "γεγον", "γεγονοτων" => "γεγον"
    ];
    /**
     * This is a list of hard-coded stems. I got the test file (90000 plus
     * terms) on the snowball site to work except for this list, so I brute
     * forced it. My suspicion why all cases didn't work is something to do with
     * my diacritic mark handling.
     * Lookups in this list happen before any normalization, so keys must be
     * spelled exactly as the term arrives (including accents).
     *
     * @var array
     */
    public static $dictionary_stems = [
        "αιθυλεστέρας" => "αιθυλ", "αμόρφωτος" => "αμορφω",
        "ανανεώθηκαν" => "αν", "ανανεώθηκε" => "αν",
        "αντιστρόφως" => "αντιστροφω", "ανωτέρας" => "αν",
        "αριστεράς" => "αριστερ", "ασαφώς" => "ασαφω",
        "αστέρας" => "αστερ", "βαλίτσα" => "βαλιτσ", "βαλίτσας" => "βαλιτσ",
        "βαλίτσες" => "βαλιτσ", "γεγονος" => "γεγον", "γεγονός" => "γεγον",
        "γεγονότος" => "γεγον", "γιαγιάδες" => "γιαγ", "δευτέρας" => "δε",
        "διαιωνίζει" => "διαι", "διαιωνίζουν" => "διαι",
        "εγγράφως" => "εγγραφω", "επτάφωτος" => "επταφω", "εσπέρας" => "εσπερ",
        "εσωτερισμού" => "εσ", "θυγατέρας" => "θυγατερ",
        "ισοπροπυλεστέρας" => "ισοπροπυλ", "καθεστώς" => "καθεστ",
        "καθεστώτος" => "καθεστ", "καλντέρας" => "καλντερ",
        "κεντροαριστεράς" => "κεντροαριστερ", "κρέας" => "κρε",
        "κρέατος" => "κρε", "κυράδες" => "κυρ", "λυκόφως" => "λυκοφω",
        "λυκόφωτος" => "λυκοφω", "μαμάδες" => "μαμ",
        "μεθυλεστέρας" => "μεθυλ", "μητέρας" => "μητερ", "νεωτέρας" => "νε",
        "νεωτερισμοί" => "νε", "νεωτερισμούς" => "νε", "νεωτερισμό" => "νε",
        "νεωτερισμός" => "νε", "νεωτεριστές" => "νε", "νεωτεριστής" => "νε",
        "νταντάδες" => "νταντ", "νυφίτσα" => "νυφιτσ", "νυφίτσες" => "νυφιτσ",
        "οκάδες" => "οκ", "ολογράφως" => "ολογραφω", "πάγκρεας" => "παγκρε",
        "πέρας" => "περ", "πίτσα" => "πιτσ", "πίτσας" => "πιτσ",
        "πίτσες" => "πιτσ", "παγκρέατος" => "παγκρε", "πατέρας" => "πατερ",
        "πατεράδες" => "πατερ",  "πατερίτσες" => "πατεριτσ",
        "πατερας" => "πατερ", "πολυεστέρας" => "πολυ",
        "προπυλεστέρας" => "προπυλ", "σαπουνόπερας" => "σαπουνοπερ",
        "σαράκι" => "σαρακ", "σαφώς" => "σαφω", "σιωνιστές" => "σ",
        "σφαγίων" => "σφα", "τέρας" => "τερ", "τέρατος" => "τερ",
        "φαινυλεστέρας" => "φαινυλ", "φως" => "φω", "φωτός" => "φω",
        "φώς" => "φω", "όπερας" => "οπερ",
    ];
    /**
     * A map from lower case Greek letters with or without diacritic marks
     * to lower case Greek letters, some that keep their marks, some that
     * don't. Final sigma is mapped to ordinary sigma.
     * NOTE(review): "ϊ" and "ΐ" map to "η" rather than "ι" -- confirm this
     * is intended before changing it, as it affects every produced stem.
     * @var array
     */
    public static $letter_map = [
        "α" => "α", "β" => "β", "γ" => "γ", "δ" => "δ", "ε" => "ε", "ζ" => "ζ",
        "η" => "η", "θ" => "θ", "ι" => "ι", "κ" => "κ", "λ" => "λ", "μ" => "μ",
        "ν" => "ν", "ξ" => "ξ", "ο" => "ο", "π" => "π", "ρ" => "ρ", "σ" => "σ",
        "τ" => "τ", "υ" => "υ", "φ" => "φ", "χ" => "χ", "ψ" => "ψ", "ω" => "ω",
        "ά" => "α", "ὰ" => "ὰ", "ᾶ" => "ᾶ", "ἀ" => "ἀ", "ἂ" => "ἂ", "ἄ" => "ἄ",
        "ἃ" => "ἃ", "έ" => "ε", "ὲ" => "ὲ", "ἑ" => "ἑ", "ἐ" => "ἐ",
        "ἕ" => "ἕ", "ἓ" => "ἓ", "ἔ" => "ἔ", "ή" => "η", "ὴ" => "ὴ",
        "ῆ"=> "ῆ", "ῇ" => "ῇ", "ἡ" => "ἡ", "ἣ" => "ἣ", "ἧ" => "ἧ", "ἦ" => "ἦ",
        "ἢ" => "ἢ", "ἤ" => "ἤ", "ό" => "ο", "ὸ" => "ὸ", "ὁ" => "ὁ", "ὅ" => "ὅ",
        "ὃ" => "ὃ", "ὄ" => "ὄ", "ύ" => "υ", "ὺ" => "ὺ", "ϋ" => "υ", "ῦ" => "ῦ",
        "ὔ" => "ὔ", "ΰ" => "υ", "ὑ"=> "ὑ", "ὐ" => "ὐ", "ὖ" => "ὖ", "ῡ" => "ῡ",
        "ὕ" => "ὕ", "ὗ" => "ὗ", "ς" => "σ", "ώ" => "ω", "ὡ"=> "ὡ", "ῶ" => "ῶ",
        "ὥ" => "ὥ", "ὼ" => "ὼ", "ῳ"=> "ῳ", "ὧ"=> "ὧ", "ῷ" =>"ῷ", "ᾧ" => "ᾧ",
        "ὦ" => "ὦ", "ί" => "ι", "ὶ" => "ὶ", "ϊ" => "η", "ῖ"=> "ῖ", "ΐ" => "η",
        "ἱ" => "ἱ", "ἰ" => "ἰ", "ἶ" => "ἶ", "ἷ" => "ἷ", "ἴ" => "ἴ", "ἵ" => "ἵ",
        "΄" => "΄",
    ];
    /**
     * Counter used to track which step in the stemming process resulted in
     * the stem which is eventually output (typically, only used by unit
     * tester). Reset to 0 by @see stem and incremented by each call to
     * @see regexStem.
     * @var int|null
     */
    public static $stem_step;
    /**
     * This method currently does nothing. For some locales it could be
     * used to split strings of the form "thisisastring" into a string
     * with the words separated: "this is a string"
     *
     * @param string $pre_segment string to be segmented
     * @return string after segmentation done (same string in this case)
     */
    public static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Computes the stem of a Greek word. The document level comments for this
     * class has references to the particular algorithm used.
     *
     * Processing order: hard-coded dictionary lookup, words that stem to the
     * empty string, diacritic removal, the "Group 1" rules (first rule that
     * matches returns immediately) and finally the "Group 2" rules (applied
     * in sequence, the accumulated result is returned).
     *
     * @param string $word is the word to be stemmed
     * @return string stem of $word
     */
    public static function stem($word)
    {
        self::$stem_step = 0;
        $vowels_with_y = '(α|ε|η|ι|ο|υ|ω)'; // vowels with upsilon
        $vowels_no_y = '(α|ε|η|ι|ο|ω)'; //vowels without upsilon
        /*
           Checks if word exists in list of hard-coded stems, if so
           return the stem from there, rather than use algorithm to compute
           stem
         */
        if (!empty(self::$dictionary_stems[$word])) {
            return self::$dictionary_stems[$word];
        }
        /* The following pattern matches words that should stem to the
          empty string which should then be returned. The comparison is
          strict so that a term such as "0" (which empty() treats as empty)
          is not mistaken for a word that was completely stripped. */
        if (preg_replace('/^(έως|ήσαν|ήσουν|ίδια|ίδιο|ίδιων|ίσα|ίσαμε|' .
             'ίσες|αγά|είς|είτε|όταν|εστέρα|εστέρες|εστέρων|εως|εώς|' .
             'αεί|εις|ιδιο|ιδία|ικό|ιού|ιστοί|ιστού|ιστούς|ιστό|ιστός|ιστών|' .
             'ιών|οταν|ουν|ους|ουσ|ούσα|όντας|εστέρας|ιδίων|ιδια)$/',
             '', $word) === '') {
            return "";
        }
        /*
           The remainder of stemming consists of a two rule groups. The first
           group of rules are executed in sequence where if a rule is applied
           then the stem is immediately returned from the function.
           The second group of rules is executed in sequence and the final
           result of the complete sequence yields a stem to return.
         */
        self::$stem_step++;
        $word = self::unmarkLetters($word);
        if (mb_strlen($word) < 3) {
            return $word;
        }
        //Group 1 Rules
        //Step S1. 14 stems
        if (self::regexStem($word,
            '/^(.+?)(ιζα|ιζες|ιζε|ιζαμε|ιζατε|ιζαν|ιζανε|ιζω|ιζεις|ιζει|'.
                'ιζουμε|ιζετε|ιζουν|ιζουνε)$/',
            ['ι' => '/^(αναμπα|εμπα|επα|ξαναπα|πα|περιπα|αθρο|συναθρο|δανε)$/',
            'ιζ'=> '/^(μαρκ|κορν|αμπαρ|αρρ|βαθυρι|βαρκ|β|βολβορ|γκρ|γλυκορ|' .
                'γλυκυρ|ιμπ|λ|λου|μαρ|μ|πρ|μπρ|πολυρ|π|ρ|πιπερορ)$/'],
            'dummy', false, true)) {
            return $word;
        }
        //Step S2. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ωθηκα|ωθηκες|ωθηκε|ωθηκαμε|ωθηκατε|ωθηκαν|ωθηκανε)$/',
            '/^(αλ|βι|εν|υψ|λι|ζω|σ|χ)$/', 'ων', true, true)) {
            return $word;
        }
        //Step S3. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ισα|ισες|ισε|ισαμε|ισατε|ισαν|ισανε)$/',
            ['ι' => '/^(αναμπα|αθρο|εμπα|εσε|εσωκλε|επα|ξαναπα|επε|περιπα|'.
                'αθρο|συναθρο|δανε|κλε|χαρτοπα|εξαρχα|μετεπε|αποκλε|απεκλε|'.
                'εκλε|πε|περιπα)$/',
            "ισ" => '/^(αν|αφ|γε|γιγαντοαφ|γκε|δημοκρατ|κομ|γκ|μ|π|' .
                'πουκαμ|ολο|λαρ)$/'], 'dummy', false, true)) {
            return $word;
        }
        //Step S4. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ισω|ισεις|ισει|ισουμε|ισετε|ισουν|ισουνε)$/',
            '/^(αναμπα|εμπα|εσε|εσωκλε|επα|ξαναπα|επε|περιπα|αθρο|'.
                'συναθρο|δανε|κλε|χαρτοπα|εξαρχα|μετεπε|αποκλε|απεκλε|'.
                'εκλε|πε|περιπα)$/', 'ι', true, true)) {
            return $word;
        }
        //Step S5. 11 stems
        if (self::regexStem($word,
            '/^(.+?)(ιστος|ιστου|ιστο|ιστε|ιστοι|ιστων|ιστους|ιστη|'.
                'ιστης|ιστα|ιστες)$/',
            ['ιστ' => '/^(μ|π|απ|αρ|ηδ|κτ|σκ|σχ|υψ|φα|χρ|χτ|ακτ|αορ|ασχ|'.
                'ατα|αχν|αχτ|γεμ|γυρ|εμπ|ευπ|εχθ|ηφα|ήφα|καθ|κακ|κυλ|λυγ|'.
                'μακ|μεγ|ταχ|φιλ|χωρ)$/',
            'ι'=> '/^(δανε|συναθρο|κλε|σε|εσωκλε|ασε|πλε)$/'], 'dummy',
            false, true)) {
            return $word;
        }
        //Step S6. 6 stems
        if (preg_match('/^(.+?)(ισμο|ισμοι|ισμος|ισμου|ισμους|ισμων)$/ui',
            $word, $match)) {
            $word  = $match[1];
            if (preg_match('/^(αγνωστικ|ατομικ|γνωστικ|εθνικ|εκλεκτικ|'.
                'σκεπτικ|τοπικ)$/ui', $word) ) {
                $word = mb_ereg_replace('ικ', "", $word);
            }
            if (preg_match('/^(σε|μετασε|μικροσε|εγκλε|αποκλε)$/ui', $word)) {
                $word .= "ισμ";
            }
            if (preg_match('/^(δανε|αντιδανε)$/ui', $word)) {
                $word .= "ι";
            }
            if (preg_match('/^(αλεξανδριν|βυζαντιν|θεατριν)$/ui', $word)) {
                $word = mb_ereg_replace('ιν', "", $word);
            }
            return $word;
        }
        //Step S7. 4 stems
        // The appended stem must be all Greek letters (Greek alpha, not
        // the look-alike Latin "a").
        if (self::regexStem($word,
            '/^(.+?)(αρακι|αρακια|ουδακι|ουδακια)$/', '/^(σ|χ)$/', "αρακ")) {
            return $word;
        }
        //Step S8. 8 stems
        // Both "ιτσ" exception patterns are merged into one alternation: a
        // PHP array literal cannot hold the key "ιτσ" twice (the later
        // value would overwrite the earlier one).
        if (self::regexStem($word,
            '/^(.+?)(ακι|ακια|ιτσα|ιτσας|ιτσες|ιτσων|αρακι|αρακια)$/',
            ["ακ" => '/^(ανθρ|βαμβ|βρ|καιμ|κον|κορ|λαβρ|λουλ|μερ|μουστ|'.
                'ναγκας|πλ|ρ|ρυ|σ|σκ|σοκ|σπαν|τζ|φαρμ|χ|καπακ|αλισφ|αμβρ|'.
                'ανθρ|κ|φυλ|κατραπ|κλιμ|μαλ|σλοβ|φ|σφ|τσεχοσλοβ)$/',
            "ιτσ" => '/^(β|βαλ|γιαν|γλ|ζ|ηγουμεν|καρδ|κον|μακρυν|νυφ|πατερ|π|'.
                'σκ|τος|τριπολ)$|(κορ)$/'], "dummy", false)) {
            return $word;
        }
        //Step S9. 3 stems
        if (self::regexStem($word, '/^(.+?)(ιδιο|ιδια|ιδιων)$/',
            ['/^(αιφν|ιρ|ολο|ψαλ)$/', '/(ε|παιχν)$/'], "ιδ", false)) {
            return $word;
        }
        //Step S10. 4 stems
        if (self::regexStem($word, '/^(.+?)(ισκος|ισκου|ισκο|ισκε)$/',
            '/^(δ|ιβ|μην|ρ|φραγκ|λυκ|οβελ)$/', "ισκ") ) {
            return $word;
        }
        //Group 2 Rules
        /* $stem_rule_applied records whether one of the steps below already
           matched. Because of the short-circuiting ||, once it is true the
           remaining steps joined with || are skipped, and so is the generic
           suffix removal of Step 6. */
        //Step 1
        $stem_rule_applied = self::regexStem($word,
            '/(.*)(φαγια|φαγιου|φαγιων|σκαγια|σκαγιου|σκαγιων|ολογιου|'.
            'ολογια|ολογιων|σογιου|σογια|σογιων|τατογια|τατογιου|τατογιων|'.
            'κρεας|κρεατος|κρεατα|κρεατων|περας|περατος|περατη|περατα|'.
            'περατων|τερας|τερατος|τερατα|τερατων|φως|φωτος|φωτα|φωτων|'.
            'καθεστως|καθεστωτος|καθεστωτα|καθεστωτων|γεγονος|γεγονοτος|'.
            'γεγονοτα|γεγονοτων)$/', '', '', true, true);
        // Step 2a. 2 stems
        if (preg_match('/^(.+?)(αδες|αδων)$/', $word, $match)) {
            $word = $match[1];
            $regex = '/(οκ|μαμ|μαν|μπαμπ|πατερ|γιαγι|νταντ|κυρ|θει|πεθερ)$/';
            // note the !, (which is why didn't use regexStem)
            if (!preg_match($regex, $word)) {
                $word .= "αδ";
            }
        }
        //Step 2b. 2 stems
        self::regexStem($word, '/^(.+?)(εδες|εδων)$/',
            '/(οπ|ιπ|εμπ|υπ|γηπ|δαπ|κρασπ|μιλ)$/', 'εδ');
        //Step 2c
        self::regexStem($word, '/^(.+?)(ουδες|ουδων)$/',
            '/(αρκ|καλιακ|πεταλ|λιχ|πλεξ|σκ|σ|φλ|φρ|βελ|λουλ|' .
                'χν|σπ|τραγ|φε)$/', 'ουδ');
        //Step 2d
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(εως|εων)$/',
            '/^(θ|δ|ελ|γαλ|ν|π|ιδ|παρ)$/', 'ε');
        //Step 3
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ια|ιου|ιων)$/',
            '/'.$vowels_with_y.'$/', 'ι');
        //Step 4
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word,
            '/^(.+?)(ικα|ικο|ικου|ικων)$/',
            ['/' . $vowels_with_y . '$/',
            '/^(αλ|αδ|ενδ|αμαν|αμμοχαλ|ηθ|ανηθ|αντιδ|φυς|βρωμ|γερ|'.
            'εξωδ|καλπ|καλλιν|καταδ|μουλ|μπαν|μπαγιατ|μπολ|μπος|νιτ|ξικ|'.
            'συνομηλ|πετς|πιτς|πικαντ|πλιατς|ποστελν|πρωτοδ|σερτ|συναδ|'.
            'τσαμ|υποδ|φιλον|φυλοδ|χας)$/'], 'ικ');
        //Step 5a
        if ($word == "αγαμε") {
            $word = "αγαμ";
        }
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αγαμε|ησαμε|ουσαμε|ηκαμε|ηθηκαμε)$/', '', '');
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(αμε)$/',
            '/^(αναπ|αποθ|αποκ|αποστ|βουβ|ξεθ|ουλ|πεθ|πικρ|ποτ|σιχ|χ)$/',
            "αμ");
        //Step 5b
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αγανε|ησανε|ουσανε|ιοντανε|'.
            'ιοτανε|ιουντανε|οντανε|οτανε|ουντανε|ηκανε|ηθηκανε)$/',
            '/^(τρ|τς)$/', "αγαν");
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ανε)$/',
            ['/'.$vowels_no_y.'$/',
            '/^(βετερ|βουλκ|βραχμ|γ|δραδουμ|θ|καλπουζ|καστελ|κορμορ|' .
            'λαοπλ|μωαμεθ|μ|μουσουλμ|ν|ουλ|π|πελεκ|πλ|πολις|πορτολ|' .
            'σαρακατς|σουλτ|τσαρλατ|ορφ|τσιγγ|τσοπ|φωτοστεφ|χ|ψυχοπλ|' .
            'αγ|ορφ|γαλ|γερ|δεκ|διπλ|αμερικαν|ουρ|πιθ|πουριτ|σ|ζωντ|ικ|' .
            'καστ|κοπ|λιχ|λουθηρ|μαιντ|μελ|σιγ|σπ|στεγ|τραγ|τσαγ|φ|ερ|' .
            'αδαπ|αθιγγ|αμηχ|ανικ|ανοργ|απηγ|απιθ|ατσιγγ|βας|βασκ|βαθυγαλ|' .
            'βιομηχ|βραχυκ|διατ|διαφ|ενοργ|θυς|καπνοβιομηχ|καταγαλ|' .
            'κλιβ|κοιλαρφ|λιβ|μεγλοβιομηχ|μικροβιομηχ|νταβ|ξηροκλιβ|' .
            'ολιγοδαμ|ολογαλ|πενταρφ|περηφ|περιτρ|πλατ|πολυδαπ|πολυμηχ|' .
            'στεφ|ταβ|τετ|υπερηφ|υποκοπ|χαμηλοδαπ|ψηλοταβ)$/'], "αν");
        //Step 5c
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ησετε)$/', '', '');
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ετε)$/',
            ['/'.$vowels_no_y.'$/',
            '/(οδ|αιρ|φορ|ταθ|διαθ|σχ|ενδ|ευρ|τιθ|υπερθ|ραθ|ενθ|'.
                'ροθ|σθ|πυρ|αιν|συνδ|συν|συνθ|χωρ|πον|βρ|καθ|ευθ|εκθ|νετ|ρον|'.
                'αρκ|βαρ|βολ|ωφελ)$/',
            '/^(αβαρ|βεν|εναρ|αβρ|αδ|αθ|αν|απλ|βαρον|ντρ|σκ|κοπ|'.
                'μπορ|νιφ|παγ|παρακαλ|σερπ|σκελ|συρφ|τοκ|υ|δ|εμ|θαρρ|θ)$/'
            ], "ετ");
        //Step 5d
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(οντας|ωντας)$/',
            ["οντ" => '/^(αρχ)$/', "ωντ" => '/(κρε)$/']);
        //Step 5e
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ομαστε|ιομαστε)$/',
            '/^(ον)$/', "ομαστ");
        //Step 5f
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ιεστε)$/',
            '/^(π|απ|συμπ|ασυμπ|ακαταπ|αμεταμφ)$/', "ιεστ");
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(εστε)$/',
            '/^(αλ|αρ|εκτελ|ζ|μ|ξ|παρακαλ|αρ|προ|νις)$/', "εστ");
        //Step 5g
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word,
            '/^(.+?)(ηθηκα|ηθηκες|ηθηκε)$/', '','');
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ηκα|ηκες|ηκε)$/',
            ['/(σκωλ|σκουλ|ναρθ|σφ|οθ|πιθ)$/',
            '/^(διαθ|θ|παρακαταθ|προσθ|συνθ|)$/'
            ], "ηκ");
        //Step 5h
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ουσα|ουσες|ουσε)$/', [
            '/^(φαρμακ|χαδ|αγκ|αναρρ|βρομ|εκλιπ|λαμπιδ|λεχ|μ|πατ|' .
                'ρ|λ|μεδ|μεσαζ|υποτειν|αμ|αιθ|ανηκ|δεσποζ|ενδιαφερ|δε|' .
                'δευτερευ|καθαρευ|πλε|τσα)$/',
            '/(ποδαρ|βλεπ|πανταχ|φρυδ|μαντιλ|μαλλ|κυματ|λαχ|ληγ|' .
                'φαγ|ομ|πρωτ)$/',
            ], "ουσ");
        //Step 5i
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(αγα|αγες|αγε)$/',
            ['/^(ψοφ|ναυλοχ)$/', '/(κολλ)$/',
            '/^(αβαστ|πολυφ|αδηφ|παμφ|ρ|ασπ|αφ|αμαλ|αμαλλι|ανυστ|' .
            'απερ|ασπαρ|αχαρ|δερβεν|δροσοπ|ξεφ|νεοπ|νομοτ|ολοπ|ομοτ|προστ|' .
            'προσωποπ|συμπ|συντ|τ|υποτ|χαρ|αειπ|αιμοστ|ανυπ|αποτ|αρτιπ|' .
            'διατ|εν|επιτ|κροκαλοπ|σιδηροπ|λ|ναυ|ουλαμ|ουρ|π|τρ|μ)$/',
            '/(οφ|πελ|χορτ|λλ|σφ|ρπ|φρ|πρ|λοχ|σμην)$/'], "αγ");
        //Step 5j
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ησε|ησου|ησα)$/',
            '/^(ν|χερσον|δωδεκαν|ερημον|μεγαλον|επταν)$/', "ησ");
        //Step 5k
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word, '/^(.+?)(ηστε)$/',
            '/^(ασβ|σβ|αχρ|χρ|απλ|αειμν|δυσχρ|ευχρ|κοινοχρ|παλιμψ)$/', "ηστ");
        //Step 5la
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word,
            '/^(.+?)(ουνε|ησουνε|ηθουνε)$/',
            '/^(ν|ρ|σπι|στραβομουτς|κακομουτς|εξων)$/', "ουν");
        //Step 5lb
        $stem_rule_applied = $stem_rule_applied ||
            self::regexStem($word,
            '/^(.+?)(ουμε|ησουμε|ηθουμε)$/',
            '/^(παρασους|φ|χ|ωριοπλ|αζ|αλλοσους|ασους)$/', "ουμ");
        // Step 6
        self::regexStem($word, '/^(.+?)(ματα|ματων|ματος)$/', '', "μα");
        // generic ending removal, only if no earlier tracked step matched
        if (!$stem_rule_applied) {
            self::regexStem($word,
                '/^(.+?)(α|αγατε|αγαν|αει|αμαι|αν|ας|ασαι|αται|αω|ε|ει|εις|' .
                'ειτε|εσαι|ες|εται|ι|ιεμαι|ιεμαστε|ιεται|ιεσαι|ιεσαστε|' .
                'ιομασταν|ιομουν|ιομουνα|ιονταν|ιοντουσαν|ιοσασταν|ιοσαστε|'.
                'ιοσουν|ιοσουνα|ιοταν|ιουμα|ιουμαστε|ιουνται|ιουνταν|η|ηδες|'.
                'ηδων|ηθει|ηθεις|ηθειτε|ηθηκατε|ηθηκαν|ηθουν|ηθω|ηκατε|ηκαν|'.
                'ης|ησαν|ησατε|ησει|ησες|ησουν|ησω|ο|οι|ομαι|ομασταν|ομουν|'.
                'ομουνα|ονται|ονταν|οντουσαν|ος|οσασταν|οσαστε|οσουν|'.
                'οσουνα|οταν|ου|ουμαι|ουμαστε|ουν|ουνται|ουνταν|ους|ουσαν|'.
                'ουσατε|υ|υς|ω|ων)$/', '', '');
        }
        // Step 7 (ΠΑΡΑΘΕΤΙΚΑ)
        self::regexStem($word,
            '/^(.+?)(εστερ|εστατ|οτερ|οτατ|υτερ|υτατ|ωτερ|ωτατ)$/', '', '');
        return $word;
    }
    /**
     * Checks if $word matches the regex $capture_stem_regex. If so, it
     * changes $word to the first capture group of that regex (optionally
     * followed by a replacement suffix, see $use_suffix). It then
     * checks the regexes in $exception_regexes either in sequence or until
     * first match. If a match is found $word has a corresponding exception
     * stem added back to its end. Each later exception regex is tested
     * against $word as modified so far. Every call increments
     * self::$stem_step, whether or not the word matched.
     *
     * @param string &$word term to be stemmed (modified in place)
     * @param string $capture_stem_regex a regex of format:
     *  /^(stem_pattern)(suffix_pattern)$/ to check against $word. ui is added
     *  to the pattern before used to enable unicode and case-insensitivity.
     * @param mixed $exception_regexes either a single exception regex
     *  string to look for or an array of regexes to look for, or
     *  an associative array of items append_stem => exception_regex. An
     *  empty regex counts as always matching. As with any PHP array,
     *  append_stem keys must be unique.
     * @param string $exception_stem if $exception_regexes is not an
     *  associative array this should be the suffix to append to word if
     *  an exception_regex matches. If it is empty no exception checking
     *  is done at all.
     * @param bool $with_break if true, then the checking of $exception_regexes
     *  is only done till the first match is found. If false, all regexes
     *  are checked against
     * @param bool $use_suffix if true and $word matches $capture_stem_regex,
     *  then suffix_pattern is looked up as a key in the map
     *  self::$suffix_patterns. If found, the corresponding value is appended
     *  to $word.
     * @return boolean whether word matched $capture_stem_regex
     */
    public static function regexStem(&$word, $capture_stem_regex,
        $exception_regexes, $exception_stem = "dummy", $with_break = true,
        $use_suffix = false)
    {
        self::$stem_step ??= 0;
        self::$stem_step++;
        if (!preg_match($capture_stem_regex . "ui", $word, $match)) {
            return false;
        }
        $suffix = "";
        if ($use_suffix) {
            $suffix = self::$suffix_patterns[$match[2]] ?? "";
        }
        $word = $match[1] . $suffix;
        if (empty($exception_stem)) {
            return true;
        }
        if (!is_array($exception_regexes)) {
            $exception_regexes = [$exception_regexes];
        }
        foreach ($exception_regexes as
            $potential_except_stem => $exception_regex) {
            if (empty($exception_regex) ||
                preg_match($exception_regex . "ui", $word)) {
                // integer keys mean a plain list: use the shared stem
                if (is_int($potential_except_stem)) {
                    $word .= $exception_stem;
                } else {
                    $word .= $potential_except_stem;
                }
                if ($with_break) {
                    break;
                }
            }
        }
        return true;
    }
    /**
     * Removes the stop words from the page (used for Word Cloud generation)
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words
     */
    public static function stopwordsRemover($data)
    {
        // the alternation over all stop words is built once per request
        static $pattern = "";
        if (empty($pattern)) {
            $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/ui';
        }
        $data = preg_replace($pattern, '', $data);
        return $data;
    }
    /**
     * Used to remove some diacritic marks from greek characters in a term.
     * Each character that is a key of self::$letter_map is replaced by its
     * mapped value; all other characters are copied unchanged.
     *
     * @param string $word term to remove diacritic marks from
     * @return string with marks removed
     */
    public static function unmarkLetters($word)
    {
        /* Every key of the map is a single character, so one strtr pass
           over the term performs the same per-character substitution as
           walking the term character by character. */
        return strtr($word, self::$letter_map);
    }
}
ViewGit