Last commit for src/library/StochasticTermSegmenter.php: 01458ec738bd61e61fa637fd7840caf1459e779c

Reorganizing IndexDocumentBundle folder structures, attempt to fix bug wherein thumbs not showing on wiki media pages

Chris Pollett [2024-01-09 02:Jan:th]

Reorganizing IndexDocumentBundle folder structures, attempt to fix bug wherein thumbs not showing on wiki media pages

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Xianghong Sun sxh19911230@gmail.com
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\locale\zh_CN\resources as ZH;
use seekquarry\yioop\configs as C;
/**
 * A Stochastic Finite-State Word-Segmenter.
 * This class contains the tools needed to segment a sentence into terms
 * using term frequencies taken from a trained dictionary.
 *
 * Currently only supports Chinese.
 * Instructions to add a new language:
 * Add a switch case in the constructor which assigns closures to
 * whichever of the following properties the language needs:
 * isExceptionImpl
 * See the class function 'isException' for more information
 * isPunctuationImpl
 * See the class function 'isPunctuation' for more information
 * isNotCurrentLangImpl
 * See the class function 'notCurrentLang' for more information
 * The Chinese case in the constructor can be used as an example
 *
 * @author Xianghong Sun
 */
class StochasticTermSegmenter
{
    /**
     * Percentage for cache entries. Value should be between 0 and 1.0
     * Set to small number when running on memory limited machines
     * Here is a general comparison when setting it to 0 and 1:
     * In the test of Chinese Segmentation on pku dataset,
     * the peak usage of memory is 26.288MB vs. 151.46MB
     * The trade off is some efficiency,
     * In the test of Chinese Segmentation on pku dataset,
     * the speed is 43.803s vs. 1.540s
     * Default value = 0.06
     * The time and Peak Memory are 5.094 s and 98.97MB
     * @var number from 0 - 1.0
     */
    private $cache_pct;
    /**
     * Run-time cache used while segmenting. Maps a first character to the
     * decoded trie of the dictionary terms that start with that character
     * @var array
     */
    private $cache = [];
    /**
     * The language currently being used  e.g. zh_CN, ja
     * @var string
     */
    public $lang;
    /**
     * Regular expression used to determine if none of the chars in a
     * term belong to the current language
     * Recommended expression for:
     * Chinese:  \p{Han}
     * Japanese: \x{4E00}-\x{9FBF}\x{3040}-\x{309F}\x{30A0}-\x{30FF}
     * Korean:   \x{3130}-\x{318F}\x{AC00}-\x{D7AF}
     * @var string
     */
    public $non_char_preg;
    /**
     * Default score for any unknown term
     * @var float
     */
    public $unknown_term_score;
    /**
     * In-memory copy of the dictionary file holding the statistical
     * information of the terms. It has two fields: "N", the number of
     * terms counted by train(), and "dic", which maps the first character
     * of a term to the JSON encoded trie of all terms beginning with it
     * @var array
     */
    public $dictionary_file;
    /**
     * Optional language specific callback used by isException().
     * Declared explicitly so that the constructor does not have to create
     * it as a dynamic property
     * @var callable|null
     */
    public $isExceptionImpl;
    /**
     * Optional language specific callback used by isPunctuation()
     * @var callable|null
     */
    public $isPunctuationImpl;
    /**
     * Optional language specific callback used by notCurrentLang()
     * @var callable|null
     */
    public $isNotCurrentLangImpl;
    /**
     * Optional named entity recognizer. When set, segmentSentence() calls
     * its predict() method on the list of characters of the sentence
     * @var object|null
     */
    public $NER;
    /**
     * Construct an instance of this class used for segmenting string with
     * respect to words in a locale using a probabilistic approach to evaluate
     * segmentation possibilities.
     * @param string $lang is a string to indicate the language
     * @param float $cache_pct fraction (0 - 1.0) of the dictionary's first
     *  character entries that may be kept decoded in the run-time cache
     */
    function __construct($lang, $cache_pct = 0.06)
    {
        $this->cache_pct = $cache_pct;
        /* Add different attribute for different languages
         * Currently only Chinese
         */
        switch ($lang) {
            case "zh_CN":
            case "zh-CN":
                $this->lang = "zh_CN";
                /*
                 * Check if the term passed in is an exception term
                 */
                $this->isExceptionImpl = function ($term) {
                    return ZH\Tokenizer::isCardinalNumber($term)
                        || ZH\Tokenizer::isOrdinalNumber($term)
                        || ZH\Tokenizer::isDate($term);
                };
                /*
                 * Check if the term passed in is a punctuation
                 */
                $this->isPunctuationImpl = function ($term) {
                    return ZH\Tokenizer::isPunctuation($term);
                };
                /*
                 * Check if all the chars in the term is NOT current language
                 */
                $this->isNotCurrentLangImpl = function ($term) {
                    return ZH\Tokenizer::isNotCurrentLang($term);
                };
                /*
                 * named entity recognizer;
                 */
                $this->NER = ZH\Tokenizer::getNER();
                break;
            default:
                $this->lang = $lang;
        }
    }
    /**
     * __call  for calling dynamic methods. Used to invoke the closures
     * stored in properties such as isExceptionImpl as if they were methods
     * @param string $method method of this class to call
     * @param array $args arguments to pass to method
     * @return mixed result of method calculation
     */
    public function __call($method, $args)
    {
        return call_user_func_array($this->$method, $args);
    }
    /**
     *  __get  for getting dynamic variables
     * @param string $var_name variable to retrieve
     * @return mixed result of retrieval
     */
    public function __get($var_name)
    {
        return $this->$var_name;
    }
    /**
     *  __set  for assigning dynamic variables
     * @param string $var_name variable to assign
     * @param  mixed $value value to assign to it
     */
    public function __set($var_name, $value)
    {
        $this->$var_name = $value;
    }
    /**
     * Check if the term passed in is an exception term
     * Not all valid terms should be indexed.
     * e.g. there are infinite combinations of numbers in the world.
     * isExceptionImpl should be defined in constructor if needed
     * @param string $term is a string that to be checked
     * @return bool true if $term is an exception term, false otherwise
     *  (always false when no isExceptionImpl has been assigned)
     */
    public function isException($term)
    {
        if (isset($this->isExceptionImpl)) {
            return $this->isExceptionImpl($term);
        }
        return false;
    }
    /**
     * Check if the term passed in is a punctuation
     * isPunctuationImpl should be defined in constructor if needed
     * @param string $term is a string that to be checked
     * @return bool true if $term is a punctuation, false otherwise
     *  (always false when no isPunctuationImpl has been assigned)
     */
    public function isPunctuation($term)
    {
        if (isset($this->isPunctuationImpl)) {
            return $this->isPunctuationImpl($term);
        }
        return false;
    }
    /**
     * Check if all the chars in the term is NOT current language
     * @param string $term is a string that to be checked
     * @return bool true if all the chars in $term is NOT current language
     *         false otherwise (always false when no isNotCurrentLangImpl
     *         has been assigned)
     */
    public function notCurrentLang($term)
    {
        if (isset($this->isNotCurrentLangImpl)) {
            return $this->isNotCurrentLangImpl($term);
        }
        return false;
    }
    /**
     * Generate a term dictionary file for later segmentation
     * @param mixed $text_files is a string name or an array of files
     *  that to be trained; words in the files need to be segmented by space
     * @param string $format currently only support default and CTB
     * @return bool true if the dictionary file was written, false if the
     *  write failed
     */
    public function train($text_files, $format = "default")
    {
        $ctb_fmt = false;
        switch ($format) {
            case "default":
                break;
            case "CTB":
                $ctb_fmt = true;
                break;
            default:
                // NOTE(review): terminates the whole process; kept for
                // backward compatibility with existing command line use
                echo "Unrecognized format";
                exit();
        }
        $out_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/term_weight.txt.gz";
        echo "Saving file to: $out_file\n";
        $dictionary = [];
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        foreach ($text_files as $text_file) {
            if (!file_exists($text_file) || is_dir($text_file)) {
                continue;
            }
            $fh = fopen($text_file, "r");
            if ($fh === false) {
                continue;
            }
            while (($line = fgets($fh)) !== false) {
                // in CTB format, lines consisting of a single tag are markup
                if ($ctb_fmt && preg_match('/^<.*>$/', trim($line))) {
                    continue;
                }
                $words = preg_split("/[\s　]+/u", $line);
                if ($words === false) {
                    continue;
                }
                foreach ($words as $word) {
                    if ($word === "" || $this->isException($word)
                        || $this->notCurrentLang($word)) {
                        continue;
                    }
                    if (isset($dictionary[$word])) {
                        $dictionary[$word]++;
                    } else if (mb_strlen($word) < 7) {
                        // only terms shorter than 7 chars get an entry
                        $dictionary[$word] = 1;
                    }
                }
            }
            fclose($fh);
        }
        $this->dictionary_file = ["N" => 0, "dic" => []];
        /* String ordering keeps all terms sharing a first character
           adjacent (PHP turns numeric-looking keys into ints, which the
           default ordering could interleave), so each trie can be built
           and encoded one first character at a time
         */
        ksort($dictionary, SORT_STRING);
        $start_char = null;
        $tmp_array = [];
        foreach ($dictionary as $key => $value) {
            $key = (string)$key;
            $first_char = mb_substr($key, 0, 1, "utf-8");
            if ($first_char !== $start_char) {
                if ($start_char !== null) {
                    $this->dictionary_file["dic"][$start_char] =
                        json_encode($tmp_array[$start_char]);
                }
                $tmp_array = [];
                $start_char = $first_char;
            }
            $this->add($key, $value, $tmp_array);
            // NOTE(review): N counts distinct terms, not occurrences
            $this->dictionary_file["N"]++;
        }
        // the trie for the last first character still has to be stored
        if ($start_char !== null) {
            $this->dictionary_file["dic"][$start_char] =
                json_encode($tmp_array[$start_char]);
        }
        $this->unknown_term_score = $this->getScore(1);
        $written = file_put_contents($out_file,
            gzencode(json_encode($this->dictionary_file), 9));
        return $written !== false;
    }

    /**
     * This function is used to segment a list of files
     * @param mixed $text_files can be a file name or a list of file names
     *        to be segmented
     * @param bool $return_string return segmented string if true,
     *        print to stdout otherwise
     *        user can use > filename to output it to a file
     * @return mixed the segmented words separated by spaces (one line per
     *        input line) if $return_string is true, otherwise true; false
     *        if the term dictionary could not be loaded
     */
    public function segmentFiles($text_files, $return_string = false)
    {
        $result = "";
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        foreach ($text_files as $text_file) {
            $fh = file_exists($text_file) ? fopen($text_file, "r") : false;
            if ($fh === false) {
                echo "cannot open $text_file\n";
                continue;
            }
            while (($line = fgets($fh)) !== false) {
                $terms = $this->segmentSentence($line);
                if ($terms === null) {
                    // no dictionary available, nothing can be segmented
                    fclose($fh);
                    return false;
                }
                $segmented = join(" ", $terms) . "\n";
                if ($return_string) {
                    $result .= $segmented;
                } else {
                    echo $segmented;
                }
            }
            fclose($fh);
        }
        if ($return_string) {
            return $result;
        }
        return true;
    }
    /**
     * Segment texts. Words are separated by space
     * @param string $text  to be segmented
     * @param bool $return_string return segmented string if true,
     *        print otherwise
     * @return mixed the segmented words separated by spaces (lines
     *        separated by newlines, no trailing newline) if $return_string
     *        is true, otherwise true; false if the term dictionary could
     *        not be loaded
     */
    public function segmentText($text, $return_string = false)
    {
        $result = "";
        foreach (explode("\n", $text) as $line) {
            if (!mb_strlen($line)) {
                continue;
            }
            $terms = $this->segmentSentence($line);
            if ($terms === null) {
                // no dictionary available, nothing can be segmented
                return false;
            }
            $segmented = join(" ", $terms) . "\n";
            if ($return_string) {
                $result .= $segmented;
            } else {
                echo $segmented;
            }
        }
        if ($return_string) {
            // drop the newline added after the last line
            return mb_substr($result, 0, -1);
        }
        return true;
    }
    /**
     * Segment a sentence into arrays of words.
     * Need NOT contain any new line characters.
     *
     * Whitespace separated pieces are segmented independently. Each piece
     * is segmented by a shortest path computation over character
     * positions: $score[$i] is the lowest cost found for the characters
     * up to position $i and $path[$i] is the position at which the
     * previous term on that cheapest path ends (-1 for the start).
     *
     * @param string $sentence is a string without newline to be segmented
     * @return array of segmented words, or null if the term dictionary
     *  could not be loaded
     */
    public function segmentSentence($sentence)
    {
        /* Load before splitting so that a missing dictionary yields null
           for every input rather than a null inside array_merge below
         */
        if (!$this->loadDictionary()) {
            return null;
        }
        $sentence = trim($sentence);
        $pieces = preg_split("/[\s　]+/u", $sentence);
        if ($pieces === false) {
            return [];
        }
        if (count($pieces) > 1) {
            $terms = [];
            foreach ($pieces as $piece) {
                $terms = array_merge($terms, $this->segmentSentence($piece));
            }
            return $terms;
        }
        // at least one cache slot; also guards against a negative cache_pct
        $cache_size = max(1,
            floor(count($this->dictionary_file['dic']) * $this->cache_pct));
        preg_match_all('/./u', $sentence, $matches);
        if (empty($matches[0])) {
            return [];
        }
        $characters = $matches[0];
        $num_chars = count($characters);
        $ner_dict = [];
        if (isset($this->NER)) {
            foreach ($this->NER->predict($characters) as $entity) {
                $this->add($entity[0], 1, $ner_dict);
            }
        }
        //init base
        $score = [-1 => 0];
        $path = [];
        $unknown = $this->unknown_term_score;
        for ($index = 0; $index < $num_chars; $index++) {
            $char = $characters[$index];
            $prev = $index - 1;
            //If not current language, treat the whole run as one term
            if ($this->notCurrentLang($char)
                && !$this->isPunctuation($char)) {
                $run = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if (!$this->notCurrentLang($run . $characters[$j])
                        || $this->isPunctuation($characters[$j])) {
                        break;
                    }
                    $run .= $characters[$j];
                }
                $this->relax($score, $path, $prev, $j - 1, $unknown);
            }
            //If date or number, extend while the run is still an exception
            if ($this->isException($char)) {
                $run = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if (!$this->isException($run . $characters[$j])) {
                        break;
                    }
                    $run .= $characters[$j];
                }
                $this->relax($score, $path, $prev, $j - 1, $unknown);
            }
            //If is punctuation, give slightly better score than unknown words
            if ($this->isPunctuation($char)) {
                $run = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if (!$this->isPunctuation($run . $characters[$j])) {
                        break;
                    }
                    $run .= $characters[$j];
                }
                $this->relax($score, $path, $prev, $j - 1, $unknown / 1.1);
            }
            /* All case (Even not in current lang because dictionary may
                contains those terms
                check the first char, give score even nothing matches
             */
            $this->relax($score, $path, $prev, $index, $unknown);
            //if entry exists, look for the term
            if (isset($this->dictionary_file["dic"][$char])) {
                if (!isset($this->cache[$char])) {
                    // newest entry goes first, oldest entries are dropped
                    $this->cache = [$char => json_decode(
                        $this->dictionary_file["dic"][$char], true)] +
                        $this->cache;
                    while (count($this->cache) > $cache_size) {
                        array_pop($this->cache);
                    }
                }
                $this->relaxTrieMatches($this->cache, $characters, $index,
                    $score, $path);
            }
            //check NER dictionary
            if (isset($ner_dict[$char])) {
                $this->relaxTrieMatches($ner_dict, $characters, $index,
                    $score, $path);
            }
        }
        //trace path back from the last position to the start
        $position = max(array_keys($path));
        $term_ends = [];
        while ($position != -1) {
            $term_ends[] = $position;
            $position = $path[$position];
        }
        $result = [];
        $term_start = 0;
        foreach (array_reverse($term_ends) as $term_end) {
            $result[] = implode("", array_slice($characters, $term_start,
                $term_end - $term_start + 1));
            $term_start = $term_end + 1;
        }
        return $result;
    }
    /**
     * Makes sure the term dictionary is available in memory, reading it
     * from the locale's term_weight.txt.gz resource if it has not been
     * loaded (or trained) yet
     * @return bool true if $this->dictionary_file can be used
     */
    private function loadDictionary()
    {
        if ($this->dictionary_file) {
            return true;
        }
        $dic_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/term_weight.txt.gz";
        if (!file_exists($dic_file)) {
            crawlLog("$dic_file does not exist!");
            return false;
        }
        $dictionary =
            json_decode(gzdecode(file_get_contents($dic_file)), true);
        gc_collect_cycles();
        if (!is_array($dictionary) || !isset($dictionary['dic'])) {
            crawlLog("$dic_file could not be decoded!");
            return false;
        }
        $this->dictionary_file = $dictionary;
        $this->unknown_term_score = $this->getScore(1);
        return true;
    }
    /**
     * Records a term spanning the positions $from + 1 .. $to if ending a
     * term at $to this way is cheaper than the best way found so far
     * @param array &$score lowest cost found so far for each end position
     * @param array &$path for each end position, where the previous term
     *  on the cheapest path ends
     * @param int $from position at which the previous term ends
     * @param int $to position at which the candidate term ends
     * @param float $cost cost of the candidate term
     */
    private function relax(&$score, &$path, $from, $to, $cost)
    {
        $candidate = $score[$from] + $cost;
        if (!isset($score[$to]) || $candidate < $score[$to]) {
            $score[$to] = $candidate;
            $path[$to] = $from;
        }
    }
    /**
     * Follows the characters starting at $index through a trie and relaxes
     * the path for every complete term found along the way. The candidate
     * is compared using the same getScore() cost that gets stored.
     * @param array $trie trie keyed by character, '$' marks the end of a
     *  term and holds its frequency
     * @param array $characters characters of the sentence
     * @param int $index position of the first character of the terms
     * @param array &$score lowest cost found so far for each end position
     * @param array &$path for each end position, where the previous term
     *  on the cheapest path ends
     */
    private function relaxTrieMatches($trie, $characters, $index, &$score,
        &$path)
    {
        $num_chars = count($characters);
        $node = $trie;
        for ($j = $index; $j < $num_chars; $j++) {
            if (!isset($node[$characters[$j]])) {
                break;
            }
            $node = $node[$characters[$j]];
            if (isset($node['$'])) {
                $this->relax($score, $path, $index - 1, $j,
                    $this->getScore($node['$']));
            }
        }
    }
    /**
     * This is the function to calculate scores for each word
     * @param int $frequency is an integer tells the frequency of a word
     * @return float the score of the term: -log($frequency / N) where N
     *  is taken from the dictionary, or 0 if no usable N is available
     */
    private function getScore($frequency)
    {
        if (!empty($this->dictionary_file["N"]) &&
            is_numeric($this->dictionary_file["N"])) {
            return -log($frequency / $this->dictionary_file["N"]);
        } else {
            return 0;
        }
    }
    /**
     * Adds a term to the dictionary
     *
     * @param string $key the term to be inserted
     * @param int $value the frequency to be inserted
     * @param array &$array trie (nested arrays keyed by character) into
     *  which the term is inserted
     */
    private function add($key, $value, & $array)
    {
        $trie_array = & $array;
        $length = mb_strlen($key, "utf-8");
        for ($i = 0; $i < $length; $i++) {
            $character = mb_substr($key, $i, 1, "utf-8");
            // If letter doesnt exist then create one by
            // assigning new array
            if (!isset($trie_array[$character])) {
                $trie_array[$character] = [];
            }
            $trie_array = & $trie_array[$character];
        }
        // Set end of term marker
        $trie_array['$'] = $value;
    }
}

ViewGit