<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\locale\el_GR\resources;

/**
 * Greek specific tokenization code. Contains a list of greek stop words
 * used in making word clouds. It also has a greek stemmer.
 * This stemmer is based on the algorithms described in
 * Ntais, Georgios. Development of a Stemmer for the Greek Language.
 * Diss. Royal Institute of Technology, 2006.
 * and
 * Saroukos, Spyridon. Enhancing a Greek language stemmer.
 * University of Tampere, 2008.
 * From here I looked at the implementation given at:
 * https://snowballstem.org/algorithms/greek/stemmer.html
 * In particular, I looked at the Snowball code, the Javascript Demo code, and
 * the PHP code (GPLv3) in:
 * https://git.drupalcode.org/project/greekstemmer
 * Copyright (c) 2009 Vassilis Spiliopoulos (http://www.psychfamily.gr)
 * Updated by Yannis Karampelas (info@netstudio.gr) in 2011 and 2017
 * respectively based on earlier work by Spyros Saroukos into Drupal CMS.
 *
 * The code below is largely a complete rewrite to make this work in UTF-8
 * lower case Greek rather than use upper case iso-8859-7 as the file encoding.
 * Most of the repetitive code has been refactored into a method regexStem
 * which is repeatedly called with different regex expressions.
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * Words we don't want to be stemmed
     * @var array
     */
    public static $no_stem_list = [];
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. For greek,
     * took the top 250 words from
     * https://en.wiktionary.org/
     * wiki/Wiktionary:Frequency_lists/Greek_wordlist#1-250
     * @var array
     */
    public static $stop_words = [
        'http', 'https',
        "να", "το", "δεν", "είναι", "θα", "και", "μου", "με", "για", "την",
        "σου", "τον", "τα", "που", "σε", "τι", "του", "αυτό", "ότι", "στο",
        "από", "της", "τη", "όχι", "ναι", "αν", "ένα", "τους", "εδώ", "μια",
        "αλλά", "μας", "είσαι", "σας", "ήταν", "πρέπει", "είμαι", "κι", "οι",
        "στην", "πολύ", "γιατί", "δε", "εγώ", "πως", "τώρα", "εντάξει",
        "ξέρω", "κάτι", "τις", "έχει", "έχω", "εσύ", "μην", "θέλω", "καλά",
        "έτσι", "στη", "στον", "αυτή", "ξέρεις", "κάνεις", "έχεις", "όταν",
        "μπορώ", "μόνο", "εκεί", "σαν", "μαζί", "πώς", "τίποτα", "κάνω",
        "όλα", "ευχαριστώ", "μπορεί", "κάνει", "ποτέ", "απ", "τόσο", "στα",
        "αυτά", "πού", "πάμε", "μέσα", "των", "μπορείς", "πιο", "υπάρχει",
        "ακόμα", "απλά", "έλα", "έχουμε", "αυτός", "σπίτι", "λοιπόν",
        "είμαστε", "τότε", "πίσω", "παρακαλώ", "μετά", "πριν", "ίσως",
        "λίγο", "νομίζω", "κύριε", "γεια", "ένας", "πάντα", "πω", "ποιος",
        "δουλειά", "μη", "δω", "λες", "αλήθεια", "όπως", "παιδιά", "όλοι",
        "είπε", "γι", "θέλεις", "άλλο", "δύο", "ας", "ζωή", "είχε", "έναν",
        "κάνουμε", "πάω", "οχι", "ωραία", "καλό", "είπα", "θες", "πες",
        "στις", "κοίτα", "πάνω", "έξω", "σένα", "χρόνια", "ώρα", "έχουν",
        "ούτε", "μία", "μα", "κάτω", "μένα", "φορά", "μέρα", "ήμουν",
        "κάποιος", "έπρεπε", "κάθε", "μέχρι", "κανείς", "καλή", "όμως",
        "επειδή", "γυναίκα", "πράγματα", "είστε", "είχα", "χωρίς", "ήθελα",
        "σωστά", "θέλει", "μαμά", "μπορούμε", "μόλις", "δυο", "πάει", "λέει",
        "θεέ", "πας", "καλύτερα", "ειναι", "σήμερα", "έγινε", "έκανε",
        "ακριβώς", "πόσο", "συγγνώμη", "πεις", "αρέσει", "έκανα",
        "συμβαίνει", "λυπάμαι", "πολλά", "φαίνεται", "www", "πρόβλημα",
        "εμένα", "είπες", "κάποιον", "στιγμή", "αυτόν", "λάθος", "μέρος",
        "γίνει", "όσο", "λένε", "λεφτά", "περίμενε", "χρόνο", "παιδί",
        "άλλη", "βλέπω", "πράγμα", "απο", "εσένα", "έκανες", "φυσικά",
        "δικό", "ήσουν", "γρήγορα", "πάλι", "στους", "πιστεύω", "κάποια",
        "ως", "φίλε", "οπότε", "μάλλον", "πάρω", "μπαμπά", "γίνεται", "λέω",
        "έχετε", "υπάρχουν", "ξέρει", "ιδέα", "χρειάζεται", "όλο", "ίδιο",
        "πήγαινε", "νομίζεις", "σίγουρα", "οτι", "συγνώμη", "πάρει", "μωρό",
        "εσείς", "νέα", "όλη", "μητέρα", "σημαίνει", "φορές", "εμείς", "είδα"
    ];
    /**
     * Associative array of suffixes to replace with simplified suffixes.
     * Keys are written in ordinary Greek spelling (final sigma as ς);
     * regexStem restores a final sigma before looking a suffix up here.
     * Used in @see regexStem
     * @var array
     */
    public static $suffix_patterns = [
        "φαγια" => "φα", "φαγιου" => "φα", "φαγιων" => "φα",
        "σκαγια" => "σκα", "σκαγιου" => "σκα", "σκαγιων" => "σκα",
        "ολογιου" => "ολο", "ολογια" => "ολο", "ολογιων" => "ολο",
        "σογιου" => "σο", "σογια" => "σο", "σογιων" => "σο",
        "τατογια" => "τατο", "τατογιου" => "τατο", "τατογιων" => "τατο",
        "κρεας" => "κρε", "κρεατος" => "κρε", "κρεατα" => "κρε",
        "κρεατων" => "κρε",
        "περας" => "περ", "περατος" => "περ",
        "περατη" => "περ", //added by spyros . also in step1 regex
        "περατα"=> "περ", "περατων" => "περ",
        "τερας" => "τερ", "τερατος" => "τερ", "τερατα" => "τερ",
        "τερατων" => "τερ",
        "φως" => "φω", "φωτος" => "φω", "φωτα" => "φω", "φωτων" => "φω",
        "καθεστως" => "καθεστ", "καθεστωτος" => "καθεστ",
        "καθεστωτα" => "καθεστ", "καθεστωτων" => "καθεστ",
        "γεγονος" => "γεγον", "γεγονοτος" => "γεγον",
        "γεγονοτα" => "γεγον", "γεγονοτων" => "γεγον"
    ];
    /**
     * This is a list of hard-coded stems. I got the test file (90000 plus
     * terms) on the snowball site to work except for this list, so I brute
     * forced it. My suspicion why all cases didn't work is something to do with
     * my diacritic mark handling.
     *
     * Entries are looked up with the raw (still accented) input word before
     * any algorithmic stemming is attempted.
     *
     * @var array
     */
    public static $dictionary_stems = [
        "αιθυλεστέρας" => "αιθυλ", "αμόρφωτος" => "αμορφω",
        "ανανεώθηκαν" => "αν", "ανανεώθηκε" => "αν",
        "αντιστρόφως" => "αντιστροφω", "ανωτέρας" => "αν",
        "αριστεράς" => "αριστερ", "ασαφώς" => "ασαφω",
        "αστέρας" => "αστερ", "βαλίτσα" => "βαλιτσ",
        "βαλίτσας" => "βαλιτσ", "βαλίτσες" => "βαλιτσ",
        "γεγονος" => "γεγον", "γεγονός" => "γεγον",
        "γεγονότος" => "γεγον", "γιαγιάδες" => "γιαγ",
        "δευτέρας" => "δε", "διαιωνίζει" => "διαι",
        "διαιωνίζουν" => "διαι", "εγγράφως" => "εγγραφω",
        "επτάφωτος" => "επταφω", "εσπέρας" => "εσπερ",
        "εσωτερισμού" => "εσ", "θυγατέρας" => "θυγατερ",
        "ισοπροπυλεστέρας" => "ισοπροπυλ", "καθεστώς" => "καθεστ",
        "καθεστώτος" => "καθεστ", "καλντέρας" => "καλντερ",
        "κεντροαριστεράς" => "κεντροαριστερ", "κρέας" => "κρε",
        "κρέατος" => "κρε", "κυράδες" => "κυρ",
        "λυκόφως" => "λυκοφω", "λυκόφωτος" => "λυκοφω",
        "μαμάδες" => "μαμ", "μεθυλεστέρας" => "μεθυλ",
        "μητέρας" => "μητερ", "νεωτέρας" => "νε",
        "νεωτερισμοί" => "νε", "νεωτερισμούς" => "νε",
        "νεωτερισμό" => "νε", "νεωτερισμός" => "νε",
        "νεωτεριστές" => "νε", "νεωτεριστής" => "νε",
        "νταντάδες" => "νταντ", "νυφίτσα" => "νυφιτσ",
        "νυφίτσες" => "νυφιτσ", "οκάδες" => "οκ",
        "ολογράφως" => "ολογραφω", "πάγκρεας" => "παγκρε",
        "πέρας" => "περ", "πίτσα" => "πιτσ",
        "πίτσας" => "πιτσ", "πίτσες" => "πιτσ",
        "παγκρέατος" => "παγκρε", "πατέρας" => "πατερ",
        "πατεράδες" => "πατερ", "πατερίτσες" => "πατεριτσ",
        "πατερας" => "πατερ", "πολυεστέρας" => "πολυ",
        "προπυλεστέρας" => "προπυλ", "σαπουνόπερας" => "σαπουνοπερ",
        "σαράκι" => "σαρακ", "σαφώς" => "σαφω",
        "σιωνιστές" => "σ", "σφαγίων" => "σφα",
        "τέρας" => "τερ", "τέρατος" => "τερ",
        "φαινυλεστέρας" => "φαινυλ", "φως" => "φω",
        "φωτός" => "φω", "φώς" => "φω",
        "όπερας" => "οπερ",
    ];
    /**
     * A map from lower case Greek letters with or without diacritic marks
     * to lower case Greek Letters some that keep their marks, some that don't.
     * Final sigma (ς) is mapped to the medial form (σ).
     * @var array
     */
    public static $letter_map = [
        "α" => "α", "β" => "β", "γ" => "γ", "δ" => "δ", "ε" => "ε",
        "ζ" => "ζ", "η" => "η", "θ" => "θ", "ι" => "ι", "κ" => "κ",
        "λ" => "λ", "μ" => "μ", "ν" => "ν", "ξ" => "ξ", "ο" => "ο",
        "π" => "π", "ρ" => "ρ", "σ" => "σ", "τ" => "τ", "υ" => "υ",
        "φ" => "φ", "χ" => "χ", "ψ" => "ψ", "ω" => "ω",
        "ά" => "α", "ὰ" => "ὰ", "ᾶ" => "ᾶ", "ἀ" => "ἀ", "ἂ" => "ἂ",
        "ἄ" => "ἄ", "ἃ" => "ἃ",
        "έ" => "ε", "ὲ" => "ὲ", "ἑ" => "ἑ", "ἐ" => "ἐ", "ἕ" => "ἕ",
        "ἓ" => "ἓ", "ἔ" => "ἔ",
        "ή" => "η", "ὴ" => "ὴ", "ῆ"=> "ῆ", "ῇ" => "ῇ", "ἡ" => "ἡ",
        "ἣ" => "ἣ", "ἧ" => "ἧ", "ἦ" => "ἦ", "ἢ" => "ἢ", "ἤ" => "ἤ",
        "ό" => "ο", "ὸ" => "ὸ", "ὁ" => "ὁ", "ὅ" => "ὅ", "ὃ" => "ὃ",
        "ὄ" => "ὄ",
        "ύ" => "υ", "ὺ" => "ὺ", "ϋ" => "υ", "ῦ" => "ῦ", "ὔ" => "ὔ",
        "ΰ" => "υ", "ὑ"=> "ὑ", "ὐ" => "ὐ", "ὖ" => "ὖ", "ῡ" => "ῡ",
        "ὕ" => "ὕ", "ὗ" => "ὗ",
        "ς" => "σ",
        "ώ" => "ω", "ὡ"=> "ὡ", "ῶ" => "ῶ", "ὥ" => "ὥ", "ὼ" => "ὼ",
        "ῳ"=> "ῳ", "ὧ"=> "ὧ", "ῷ" =>"ῷ", "ᾧ" => "ᾧ", "ὦ" => "ὦ",
        // NOTE(review): ϊ and ΐ map to η here (not ι) -- looks unusual,
        // confirm this is intended before changing it.
        "ί" => "ι", "ὶ" => "ὶ", "ϊ" => "η", "ῖ"=> "ῖ", "ΐ" => "η",
        "ἱ" => "ἱ", "ἰ" => "ἰ", "ἶ" => "ἶ", "ἷ" => "ἷ", "ἴ" => "ἴ",
        "ἵ" => "ἵ",
        "΄" => "΄",
    ];
    /**
     * Used to track which step in the stemming process resulted in the
     * stem which is eventually output (typically, only used by unit tester).
     * Reset to 0 at the start of stem() and incremented by each regexStem
     * call.
     * @var int
     */
    public static $stem_step;
    /**
     * This method currently does nothing. For some locales it could be
     * used to split strings of the form "thisisastring" into a string
     * with the words separated: "this is a string"
     *
     * @param string $pre_segment string to be segmented
     * @return string after segmentation done (same string in this case)
     */
    public static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Computes the stem of a Greek word. The document level comments for this
     * class has references to the particular algorithm used.
     *
     * Processing order: (1) exact lookup in self::$dictionary_stems,
     * (2) words that stem to the empty string, (3) diacritic/final-sigma
     * normalization via unmarkLetters, (4) "Group 1" rules, each of which
     * returns immediately when it fires, (5) "Group 2" rules, which are
     * applied in sequence to build the final stem.
     *
     * @param string $word is the word to be stemmed
     * @return string stem of $word
     */
    public static function stem($word)
    {
        self::$stem_step = 0;
        $vowels_with_y = '(α|ε|η|ι|ο|υ|ω)'; // vowels with upsilon
        $vowels_no_y = '(α|ε|η|ι|ο|ω)'; //vowels without upsilon
        /* Checks if word exists in list of hard-coded stems, if so return
           the stem from there, rather than use algorithm to compute stem
         */
        if (!empty(self::$dictionary_stems[$word])) {
            return self::$dictionary_stems[$word];
        }
        /* The following pattern matches words that should stem to the
           empty string which should then be returned. The result is compared
           against '' (rather than using empty()) so that a word such as "0"
           is not mistaken for an empty stem.
         */
        if (preg_replace('/^(έως|ήσαν|ήσουν|ίδια|ίδιο|ίδιων|ίσα|ίσαμε|' .
            'ίσες|αγά|είς|είτε|όταν|εστέρα|εστέρες|εστέρων|εως|εώς|' .
            'αεί|εις|ιδιο|ιδία|ικό|ιού|ιστοί|ιστού|ιστούς|ιστό|ιστός|ιστών|' .
            'ιών|οταν|ουν|ους|ουσ|ούσα|όντας|εστέρας|ιδίων|ιδια)$/', '',
            $word) === '') {
            return "";
        }
        /* The remainder of stemming consists of two rule groups.
           The first group of rules are executed in sequence where if a
           rule is applied then the stem is immediately returned from the
           function. The second group of rules is executed in sequence
           and the final result of the complete sequence yields a stem to
           return.
         */
        self::$stem_step++;
        $word = self::unmarkLetters($word);
        if (mb_strlen($word) < 3) {
            return $word;
        }
        //Group 1 Rules
        //Step S1. 14 stems
        if (self::regexStem($word,
            '/^(.+?)(ιζα|ιζες|ιζε|ιζαμε|ιζατε|ιζαν|ιζανε|ιζω|ιζεις|ιζει|'.
            'ιζουμε|ιζετε|ιζουν|ιζουνε)$/',
            ['ι' =>
                '/^(αναμπα|εμπα|επα|ξαναπα|πα|περιπα|αθρο|συναθρο|δανε)$/',
            'ιζ'=> '/^(μαρκ|κορν|αμπαρ|αρρ|βαθυρι|βαρκ|β|βολβορ|γκρ|γλυκορ|' .
                'γλυκυρ|ιμπ|λ|λου|μαρ|μ|πρ|μπρ|πολυρ|π|ρ|πιπερορ)$/'],
            'dummy', false, true)) {
            return $word;
        }
        //Step S2. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ωθηκα|ωθηκες|ωθηκε|ωθηκαμε|ωθηκατε|ωθηκαν|ωθηκανε)$/',
            '/^(αλ|βι|εν|υψ|λι|ζω|σ|χ)$/', 'ων', true, true)) {
            return $word;
        }
        //Step S3. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ισα|ισες|ισε|ισαμε|ισατε|ισαν|ισανε)$/',
            ['ι' => '/^(αναμπα|αθρο|εμπα|εσε|εσωκλε|επα|ξαναπα|επε|περιπα|'.
                'αθρο|συναθρο|δανε|κλε|χαρτοπα|εξαρχα|μετεπε|αποκλε|απεκλε|'.
                'εκλε|πε|περιπα)$/',
            "ισ" => '/^(αν|αφ|γε|γιγαντοαφ|γκε|δημοκρατ|κομ|γκ|μ|π|' .
                'πουκαμ|ολο|λαρ)$/'],
            'dummy', false, true)) {
            return $word;
        }
        //Step S4. 7 stems
        if (self::regexStem($word,
            '/^(.+?)(ισω|ισεις|ισει|ισουμε|ισετε|ισουν|ισουνε)$/',
            '/^(αναμπα|εμπα|εσε|εσωκλε|επα|ξαναπα|επε|περιπα|αθρο|'.
            'συναθρο|δανε|κλε|χαρτοπα|εξαρχα|μετεπε|αποκλε|απεκλε|'.
            'εκλε|πε|περιπα)$/', 'ι', true, true)) {
            return $word;
        }
        //Step S5. 11 stems
        if (self::regexStem($word,
            '/^(.+?)(ιστος|ιστου|ιστο|ιστε|ιστοι|ιστων|ιστους|ιστη|'.
            'ιστης|ιστα|ιστες)$/',
            ['ιστ' => '/^(μ|π|απ|αρ|ηδ|κτ|σκ|σχ|υψ|φα|χρ|χτ|ακτ|αορ|ασχ|'.
                'ατα|αχν|αχτ|γεμ|γυρ|εμπ|ευπ|εχθ|ηφα|ήφα|καθ|κακ|κυλ|λυγ|'.
                'μακ|μεγ|ταχ|φιλ|χωρ)$/',
            'ι'=> '/^(δανε|συναθρο|κλε|σε|εσωκλε|ασε|πλε)$/'],
            'dummy', false, true)) {
            return $word;
        }
        //Step S6. 6 stems
        if (preg_match('/^(.+?)(ισμο|ισμοι|ισμος|ισμου|ισμους|ισμων)$/ui',
            $word, $match)) {
            $word = $match[1];
            if (preg_match('/^(αγνωστικ|ατομικ|γνωστικ|εθνικ|εκλεκτικ|'.
                'σκεπτικ|τοπικ)$/ui', $word) ) {
                $word = mb_ereg_replace('ικ', "", $word);
            }
            if (preg_match('/^(σε|μετασε|μικροσε|εγκλε|αποκλε)$/ui', $word)) {
                $word .= "ισμ";
            }
            if (preg_match('/^(δανε|αντιδανε)$/ui', $word)) {
                $word .= "ι";
            }
            if (preg_match('/^(αλεξανδριν|βυζαντιν|θεατριν)$/ui', $word)) {
                $word = mb_ereg_replace('ιν', "", $word);
            }
            return $word;
        }
        //Step S7. 4 stems
        // The appended stem is entirely Greek letters (Greek alpha, not
        // the look-alike Latin "a").
        if (self::regexStem($word,
            '/^(.+?)(αρακι|αρακια|ουδακι|ουδακια)$/',
            '/^(σ|χ)$/', "αρακ")) {
            return $word;
        }
        //Step S8. 8 stems
        // Both "ιτσ" exception patterns are merged into one regex: PHP array
        // keys are unique, so listing "ιτσ" twice would drop the first list.
        if (self::regexStem($word,
            '/^(.+?)(ακι|ακια|ιτσα|ιτσας|ιτσες|ιτσων|αρακι|αρακια)$/',
            ["ακ" => '/^(ανθρ|βαμβ|βρ|καιμ|κον|κορ|λαβρ|λουλ|μερ|μουστ|'.
                'ναγκας|πλ|ρ|ρυ|σ|σκ|σοκ|σπαν|τζ|φαρμ|χ|καπακ|αλισφ|αμβρ|'.
                'ανθρ|κ|φυλ|κατραπ|κλιμ|μαλ|σλοβ|φ|σφ|τσεχοσλοβ)$/',
            "ιτσ" =>
                '/^(β|βαλ|γιαν|γλ|ζ|ηγουμεν|καρδ|κον|μακρυν|νυφ|πατερ|π|'.
                'σκ|τος|τριπολ)$|(κορ)$/'],
            "dummy", false)) {
            return $word;
        }
        //Step S9. 3 stems
        if (self::regexStem($word,
            '/^(.+?)(ιδιο|ιδια|ιδιων)$/',
            ['/^(αιφν|ιρ|ολο|ψαλ)$/', '/(ε|παιχν)$/'], "ιδ", false)) {
            return $word;
        }
        //Step S10. 4 stems
        if (self::regexStem($word,
            '/^(.+?)(ισκος|ισκου|ισκο|ισκε)$/',
            '/^(δ|ιβ|μην|ρ|φραγκ|λυκ|οβελ)$/', "ισκ") ) {
            return $word;
        }
        //Group 2 Rules
        //Step 1
        $stem_rule_applied = self::regexStem($word,
            '/(.*)(φαγια|φαγιου|φαγιων|σκαγια|σκαγιου|σκαγιων|ολογιου|'.
            'ολογια|ολογιων|σογιου|σογια|σογιων|τατογια|τατογιου|τατογιων|'.
            'κρεας|κρεατος|κρεατα|κρεατων|περας|περατος|περατη|περατα|'.
            'περατων|τερας|τερατος|τερατα|τερατων|φως|φωτος|φωτα|φωτων|'.
            'καθεστως|καθεστωτος|καθεστωτα|καθεστωτων|γεγονος|γεγονοτος|'.
            'γεγονοτα|γεγονοτων)$/', '', '', true, true);
        // Step 2a. 2 stems
        // "ui" flags are needed so the final-sigma spelling αδες matches the
        // normalized word (unmarkLetters turned ς into σ).
        if (preg_match('/^(.+?)(αδες|αδων)$/ui', $word, $match)) {
            $word = $match[1];
            $regex =
                '/(οκ|μαμ|μαν|μπαμπ|πατερ|γιαγι|νταντ|κυρ|θει|πεθερ)$/ui';
            // note the !, (which is why didn't use regexStem)
            if (!preg_match($regex, $word)) {
                $word .= "αδ";
            }
        }
        //Step 2b. 2 stems
        self::regexStem($word, '/^(.+?)(εδες|εδων)$/',
            '/(οπ|ιπ|εμπ|υπ|γηπ|δαπ|κρασπ|μιλ)$/', 'εδ');
        //Step 2c
        self::regexStem($word, '/^(.+?)(ουδες|ουδων)$/',
            '/(αρκ|καλιακ|πεταλ|λιχ|πλεξ|σκ|σ|φλ|φρ|βελ|λουλ|' .
            'χν|σπ|τραγ|φε)$/', 'ουδ');
        /* From here to Step 5lb the || short-circuits: once one of these
           rules has matched, the remaining ones are not tried.
         */
        //Step 2d
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(εως|εων)$/', '/^(θ|δ|ελ|γαλ|ν|π|ιδ|παρ)$/', 'ε');
        //Step 3
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ια|ιου|ιων)$/', '/'.$vowels_with_y.'$/', 'ι');
        //Step 4
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ικα|ικο|ικου|ικων)$/',
            ['/' . $vowels_with_y . '$/',
            '/^(αλ|αδ|ενδ|αμαν|αμμοχαλ|ηθ|ανηθ|αντιδ|φυς|βρωμ|γερ|'.
            'εξωδ|καλπ|καλλιν|καταδ|μουλ|μπαν|μπαγιατ|μπολ|μπος|νιτ|ξικ|'.
            'συνομηλ|πετς|πιτς|πικαντ|πλιατς|ποστελν|πρωτοδ|σερτ|συναδ|'.
            'τσαμ|υποδ|φιλον|φυλοδ|χας)$/'], 'ικ');
        //Step 5a
        if ($word === "αγαμε") {
            $word = "αγαμ";
        }
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αγαμε|ησαμε|ουσαμε|ηκαμε|ηθηκαμε)$/', '', '');
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αμε)$/',
            '/^(αναπ|αποθ|αποκ|αποστ|βουβ|ξεθ|ουλ|πεθ|πικρ|ποτ|σιχ|χ)$/',
            "αμ");
        //Step 5b
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αγανε|ησανε|ουσανε|ιοντανε|'.
            'ιοτανε|ιουντανε|οντανε|οτανε|ουντανε|ηκανε|ηθηκανε)$/',
            '/^(τρ|τς)$/', "αγαν");
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ανε)$/',
            ['/'.$vowels_no_y.'$/',
            '/^(βετερ|βουλκ|βραχμ|γ|δραδουμ|θ|καλπουζ|καστελ|κορμορ|' .
            'λαοπλ|μωαμεθ|μ|μουσουλμ|ν|ουλ|π|πελεκ|πλ|πολις|πορτολ|' .
            'σαρακατς|σουλτ|τσαρλατ|ορφ|τσιγγ|τσοπ|φωτοστεφ|χ|ψυχοπλ|' .
            'αγ|ορφ|γαλ|γερ|δεκ|διπλ|αμερικαν|ουρ|πιθ|πουριτ|σ|ζωντ|ικ|' .
            'καστ|κοπ|λιχ|λουθηρ|μαιντ|μελ|σιγ|σπ|στεγ|τραγ|τσαγ|φ|ερ|' .
            'αδαπ|αθιγγ|αμηχ|ανικ|ανοργ|απηγ|απιθ|ατσιγγ|βας|βασκ|βαθυγαλ|' .
            'βιομηχ|βραχυκ|διατ|διαφ|ενοργ|θυς|καπνοβιομηχ|καταγαλ|' .
            'κλιβ|κοιλαρφ|λιβ|μεγλοβιομηχ|μικροβιομηχ|νταβ|ξηροκλιβ|' .
            'ολιγοδαμ|ολογαλ|πενταρφ|περηφ|περιτρ|πλατ|πολυδαπ|πολυμηχ|' .
            'στεφ|ταβ|τετ|υπερηφ|υποκοπ|χαμηλοδαπ|ψηλοταβ)$/'], "αν");
        //Step 5c
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ησετε)$/', '', '');
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ετε)$/',
            ['/'.$vowels_no_y.'$/',
            '/(οδ|αιρ|φορ|ταθ|διαθ|σχ|ενδ|ευρ|τιθ|υπερθ|ραθ|ενθ|'.
            'ροθ|σθ|πυρ|αιν|συνδ|συν|συνθ|χωρ|πον|βρ|καθ|ευθ|εκθ|νετ|ρον|'.
            'αρκ|βαρ|βολ|ωφελ)$/',
            '/^(αβαρ|βεν|εναρ|αβρ|αδ|αθ|αν|απλ|βαρον|ντρ|σκ|κοπ|'.
            'μπορ|νιφ|παγ|παρακαλ|σερπ|σκελ|συρφ|τοκ|υ|δ|εμ|θαρρ|θ)$/'
            ], "ετ");
        //Step 5d
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(οντας|ωντας)$/',
            ["οντ" => '/^(αρχ)$/', "ωντ" => '/(κρε)$/']);
        //Step 5e
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ομαστε|ιομαστε)$/', '/^(ον)$/', "ομαστ");
        //Step 5f
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ιεστε)$/', '/^(π|απ|συμπ|ασυμπ|ακαταπ|αμεταμφ)$/',
            "ιεστ");
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(εστε)$/',
            '/^(αλ|αρ|εκτελ|ζ|μ|ξ|παρακαλ|αρ|προ|νις)$/', "εστ");
        //Step 5g
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ηθηκα|ηθηκες|ηθηκε)$/', '','');
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ηκα|ηκες|ηκε)$/',
            ['/(σκωλ|σκουλ|ναρθ|σφ|οθ|πιθ)$/',
            '/^(διαθ|θ|παρακαταθ|προσθ|συνθ|)$/'
            ], "ηκ");
        //Step 5h
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ουσα|ουσες|ουσε)$/', [
            '/^(φαρμακ|χαδ|αγκ|αναρρ|βρομ|εκλιπ|λαμπιδ|λεχ|μ|πατ|' .
            'ρ|λ|μεδ|μεσαζ|υποτειν|αμ|αιθ|ανηκ|δεσποζ|ενδιαφερ|δε|' .
            'δευτερευ|καθαρευ|πλε|τσα)$/',
            '/(ποδαρ|βλεπ|πανταχ|φρυδ|μαντιλ|μαλλ|κυματ|λαχ|ληγ|' .
            'φαγ|ομ|πρωτ)$/',
            ], "ουσ");
        //Step 5i
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(αγα|αγες|αγε)$/',
            ['/^(ψοφ|ναυλοχ)$/',
            '/(κολλ)$/',
            '/^(αβαστ|πολυφ|αδηφ|παμφ|ρ|ασπ|αφ|αμαλ|αμαλλι|ανυστ|' .
            'απερ|ασπαρ|αχαρ|δερβεν|δροσοπ|ξεφ|νεοπ|νομοτ|ολοπ|ομοτ|προστ|' .
            'προσωποπ|συμπ|συντ|τ|υποτ|χαρ|αειπ|αιμοστ|ανυπ|αποτ|αρτιπ|' .
            'διατ|εν|επιτ|κροκαλοπ|σιδηροπ|λ|ναυ|ουλαμ|ουρ|π|τρ|μ)$/',
            '/(οφ|πελ|χορτ|λλ|σφ|ρπ|φρ|πρ|λοχ|σμην)$/'], "αγ");
        //Step 5j
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ησε|ησου|ησα)$/',
            '/^(ν|χερσον|δωδεκαν|ερημον|μεγαλον|επταν)$/', "ησ");
        //Step 5k
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ηστε)$/',
            '/^(ασβ|σβ|αχρ|χρ|απλ|αειμν|δυσχρ|ευχρ|κοινοχρ|παλιμψ)$/',
            "ηστ");
        //Step 5la
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ουνε|ησουνε|ηθουνε)$/',
            '/^(ν|ρ|σπι|στραβομουτς|κακομουτς|εξων)$/', "ουν");
        //Step 5lb
        $stem_rule_applied = $stem_rule_applied || self::regexStem($word,
            '/^(.+?)(ουμε|ησουμε|ηθουμε)$/',
            '/^(παρασους|φ|χ|ωριοπλ|αζ|αλλοσους|ασους)$/', "ουμ");
        // Step 6
        self::regexStem($word, '/^(.+?)(ματα|ματων|ματος)$/', '', "μα");
        // generic ending removal only happens if no earlier guarded rule fired
        if (!$stem_rule_applied) {
            self::regexStem($word,
            '/^(.+?)(α|αγατε|αγαν|αει|αμαι|αν|ας|ασαι|αται|αω|ε|ει|εις|' .
            'ειτε|εσαι|ες|εται|ι|ιεμαι|ιεμαστε|ιεται|ιεσαι|ιεσαστε|' .
            'ιομασταν|ιομουν|ιομουνα|ιονταν|ιοντουσαν|ιοσασταν|ιοσαστε|'.
            'ιοσουν|ιοσουνα|ιοταν|ιουμα|ιουμαστε|ιουνται|ιουνταν|η|ηδες|'.
            'ηδων|ηθει|ηθεις|ηθειτε|ηθηκατε|ηθηκαν|ηθουν|ηθω|ηκατε|ηκαν|'.
            'ης|ησαν|ησατε|ησει|ησες|ησουν|ησω|ο|οι|ομαι|ομασταν|ομουν|'.
            'ομουνα|ονται|ονταν|οντουσαν|ος|οσασταν|οσαστε|οσουν|'.
            'οσουνα|οταν|ου|ουμαι|ουμαστε|ουν|ουνται|ουνταν|ους|ουσαν|'.
            'ουσατε|υ|υς|ω|ων)$/', '', '');
        }
        // Step 7 (ΠΑΡΑΘΕΤΙΚΑ)
        self::regexStem($word,
            '/^(.+?)(εστερ|εστατ|οτερ|οτατ|υτερ|υτατ|ωτερ|ωτατ)$/', '', '');
        return $word;
    }
    /**
     * Checks if $word matches the regex $capture_stem_regex. If so
     * changes $word to the first capture group of that regex. It then
     * checks the regexes in $exception_regexes either in sequence or until
     * first match. If a match is found $word has a corresponding exception
     * stem added back to its end.
     *
     * @param string &$word term to be stemmed
     * @param string $capture_stem_regex a regex of format:
     *  /^(stem_pattern)(suffix_pattern)$/ to check against $word. ui is added
     *  to the pattern before used to enable unicode, case-insensitive
     *  matching.
     * @param mixed $exception_regexes either a string single exception
     *  regex to look for or an array of regexes to look for, or
     *  an associative array of items append_stem => exception_regex. An
     *  empty regex counts as always matching.
     * @param string $exception_stem if $exception_regexes is not an
     *  associative array this should be the suffix to append to word if
     *  an exception_regex matches. If empty, no exception checking is done.
     * @param bool $with_break if true, then the checking of $exception_regexes
     *  is only done till the first match is found. If false, all regexes
     *  are checked against
     * @param bool $use_suffix if true and $word matches $capture_stem_regex,
     *  then suffix_pattern is looked up as a key in the map
     *  self::$suffix_patterns. If found, the corresponding value is appended
     *  to $word.
     * @return boolean whether word matched $capture_stem_regex
     */
    public static function regexStem(&$word, $capture_stem_regex,
        $exception_regexes, $exception_stem = "dummy", $with_break = true,
        $use_suffix = false)
    {
        self::$stem_step ??= 0;
        self::$stem_step++;
        if (!preg_match($capture_stem_regex . "ui", $word, $match)) {
            return false;
        }
        $suffix = "";
        if ($use_suffix) {
            $suffix_key = $match[2];
            /* The text captured from an unmarked word ends in σ where the
               keys of self::$suffix_patterns are spelled with a final ς,
               so retry the lookup with the final sigma restored.
             */
            if (!isset(self::$suffix_patterns[$suffix_key]) &&
                mb_substr($suffix_key, -1) === 'σ') {
                $suffix_key = mb_substr($suffix_key, 0, -1) . 'ς';
            }
            $suffix = self::$suffix_patterns[$suffix_key] ?? "";
        }
        $word = $match[1] . $suffix;
        if (!empty($exception_stem)) {
            if (!is_array($exception_regexes)) {
                $exception_regexes = [$exception_regexes];
            }
            foreach ($exception_regexes as
                $potential_except_stem => $exception_regex) {
                if (empty($exception_regex) ||
                    preg_match($exception_regex . "ui", $word)) {
                    // int keys mean a plain list: use the shared stem
                    if (is_int($potential_except_stem)) {
                        $word .= $exception_stem;
                    } else {
                        $word .= $potential_except_stem;
                    }
                    if ($with_break) {
                        break;
                    }
                }
            }
        }
        return true;
    }
    /**
     * Removes the stop words from the page (used for Word Cloud generation)
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words
     */
    public static function stopwordsRemover($data)
    {
        // the alternation regex is built once and cached between calls
        static $pattern = "";
        if (empty($pattern)) {
            $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/ui';
        }
        $data = preg_replace($pattern, '', $data);
        return $data;
    }
    /**
     * Used to remove some diacritic marks from greek characters in a term.
     * Each character is replaced by its image under self::$letter_map;
     * characters not in the map are copied unchanged.
     *
     * @param string $word term to remove diacritic marks from
     * @return string with marks removed
     */
    public static function unmarkLetters($word)
    {
        $out_word = "";
        foreach (mb_str_split((string)$word) as $cur_char) {
            $out_word .= self::$letter_map[$cur_char] ?? $cur_char;
        }
        return $out_word;
    }
}