Last commit for src/locale/tr/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 *
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\locale\tr\resources;

/**
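 * Turkish specific tokenization code. Typically, a locale's Tokenizer.php
 * either contains a stemmer for the language in question or
 * it specifies how many characters to use in a char gram. This Turkish
 * version has no stemmer: it supplies a stop word list, a char gram
 * length, and a stop word remover.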
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. This is also used
     * for language detection. Every entry must be a non-empty string.
     * @var array
     */
    public static $stop_words = ['olarak', 'ben', 'onun', 'bu', 'diye',
        'oldu', 'için', 'üzerinde', 'vardır', 'ile', 'onlar', 'olmak', 'at',
        'bir', 'var', 'Bu', 'dan', 'tarafından', 'sıcak', 'kelime', 'ancak',
        'ne', 'bazı', 'olduğunu', 'o', 'sen', 'veya', 'vardı', 'arasında',
        'karşı', 've', 'bir', 'içinde', 'biz', 'can', 'üzerinden', 'diğer',
        'vardı', 'hangi', 'do', 'onların', 'zaman', 'eğer', 'olacak',
        'nasıl', 'dedi', 'bir', 'her', 'söyle', 'yok', 'set', 'üç',
        'istiyorum', 'hava', 'iyi', 'ayrıca', 'oynamak', 'küçük', 'son',
        'koymak', 'ev', 'okumak', 'el', 'liman', 'büyük', 'büyü', 'ekleyin',
        'hatta', 'arazi', 'burada', 'gerekir', 'büyük', 'yüksek', 'böyle',
        'izleyin', 'hareket', 'neden', 'sormak', 'erkekler', 'değişim',
        'gitti', 'ışık', 'tür', 'kapalı', 'gerek', 'ev', 'resim', 'denemek',
        'bizi', 'tekrar', 'hayvan', 'nokta', 'anne', 'dünya', 'yakın',
        'inşa', 'etmek', 'öz', 'toprak', 'baba'];
    /**
     * How many characters in a char gram for this locale
     * @var int
     */
    public static $char_gram_len = 5;
    /**
     * Removes the stop words from the page (used for Word Cloud generation
     * and language detection)
     *
     * Only whole words are removed and matching is case-sensitive. The
     * regular expression is built from self::$stop_words on the first call
     * and then cached, so later changes to that list are not picked up.
     * If the regular expression engine reports an error for a string (for
     * example because it is not valid UTF-8) that string is returned
     * unchanged rather than being replaced by null or dropped.
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words; when $data is an array the
     *      result has the same keys in the same order
     */
    public static function stopwordsRemover($data)
    {
        static $pattern = "";
        if (empty($pattern)) {
            $alternatives = [];
            foreach (array_unique(self::$stop_words) as $word) {
                // an empty alternative would match at every word boundary
                if ($word !== '') {
                    $alternatives[] = preg_quote($word, '/');
                }
            }
            $pattern = '/\b(' . implode('|', $alternatives) . ')\b/u';
        }
        if (is_array($data)) {
            // handle each entry on its own so that one entry the regex
            // engine rejects cannot remove keys from the result
            $cleaned = [];
            foreach ($data as $key => $text) {
                $cleaned[$key] = self::stopwordsRemover($text);
            }
            return $cleaned;
        }
        $stripped = preg_replace($pattern, '', $data);
        // preg_replace signals failure (e.g., malformed UTF-8) with null
        return ($stripped === null) ? $data : $stripped;
    }
}

ViewGit