<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Xianghong Sun sxh19911230@gmail.com
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\locale\zh_CN\resources as ZH;
use seekquarry\yioop\configs as C;

/**
 * A Stochastic Finite-State Word-Segmenter.
 * This class contains the tools necessary to segment sentences into terms.
 *
 * Currently only supports Chinese.
 * Instructions to add a new language:
 * Add a switch case in the constructor.
 * Define the following closures there (they are stored as dynamic
 * properties and invoked through __call):
 *   isExceptionImpl
 *     See the class function 'isException' for more information
 *   isPunctuationImpl
 *     See the class function 'isPunctuation' for more information
 *   isNotCurrentLangImpl
 *     See the class function 'notCurrentLang' for more information
 * A Chinese example is provided in the constructor
 *
 * @author Xianghong Sun
 */
class StochasticTermSegmenter
{
    /**
     * Percentage for cache entries. Value should be between 0 and 1.0
     * Set to a small number when running on memory limited machines
     * Here is a general comparison when setting it to 0 and 1:
     * In the test of Chinese Segmentation on pku dataset,
     * the peak usage of memory is 26.288MB vs. 151.46MB
     * The trade off is some efficiency,
     * In the test of Chinese Segmentation on pku dataset,
     * the speed is 43.803s vs. 1.540s
     * Default value = 0.06
     * The time and Peak Memory are 5.094 s and 98.97MB
     * @var number from 0 - 1.0
     */
    private $cache_pct;
    /**
     * Cache. Holds decoded per-first-character tries used at runtime
     * by the segmentation
     * @var array
     */
    private $cache = [];
    /**
     * The language currently being used e.g. zh_CN, ja
     * @var string
     */
    public $lang;
    /**
     * Regular expression to determine if none of the chars in a
     * term are in the current language
     * Recommended expression for:
     * Chinese:  \p{Han}
     * Japanese: \x{4E00}-\x{9FBF}\x{3040}-\x{309F}\x{30A0}-\x{30FF}
     * Korean:   \x{3130}-\x{318F}\x{AC00}-\x{D7AF}
     * @var string
     */
    public $non_char_preg;
    /**
     * Default score for any unknown term
     * @var float
     */
    public $unknown_term_score;
    /**
     * In-memory dictionary holding the statistical information of the
     * terms: key "N" is a count used to normalize frequencies and key
     * "dic" maps a first character to the JSON encoded trie of the terms
     * starting with that character
     * @var array
     */
    public $dictionary_file;
    /**
     * Construct an instance of this class used for segmenting string with
     * respect to words in a locale using a probabilistic approach to evaluate
     * segmentation possibilities.
     * @param string $lang is a string to indicate the language
     * @param float $cache_pct fraction (0 - 1.0) of the per-character
     *      dictionary entries that may be kept decoded in the cache
     */
    function __construct($lang, $cache_pct = 0.06)
    {
        $this->cache_pct = $cache_pct;
        /* Add different attribute for different languages
         * Currently only Chinese
         */
        switch ($lang) {
            case "zh_CN":
            case "zh-CN":
                $this->lang = "zh_CN";
                /*
                 * Check if the term passed in is an exception term
                 */
                $this->isExceptionImpl = function ($term) {
                    return ZH\Tokenizer::isCardinalNumber($term)
                        || ZH\Tokenizer::isOrdinalNumber($term)
                        || ZH\Tokenizer::isDate($term);
                };
                /*
                 * Check if the term passed in is a punctuation
                 */
                $this->isPunctuationImpl = function ($term) {
                    return ZH\Tokenizer::isPunctuation($term);
                };
                /*
                 * Check if all the chars in the term are NOT current language
                 */
                $this->isNotCurrentLangImpl = function ($term) {
                    return ZH\Tokenizer::isNotCurrentLang($term);
                };
                /*
                 * named entity recognizer;
                 */
                $this->NER = ZH\Tokenizer::getNER();
                break;
            default:
                $this->lang = $lang;
        }
    }
    /**
     * __call for calling dynamic methods. Used to invoke the closures
     * stored in properties such as isExceptionImpl.
     * @param string $method method of this class to call
     * @param array $args arguments to pass to method
     * @return mixed result of method calculation
     */
    public function __call($method, $args)
    {
        return call_user_func_array($this->$method, $args);
    }
    /**
     * __get for getting dynamic variables
     * @param string $var_name variable to retrieve
     * @return mixed result of retrieval
     */
    public function __get($var_name)
    {
        return $this->$var_name;
    }
    /**
     * __set for assigning dynamic variables
     * @param string $var_name variable to assign
     * @param mixed $value value to assign to it
     */
    public function __set($var_name, $value)
    {
        $this->$var_name = $value;
    }
    /**
     * Check if the term passed in is an exception term
     * Not all valid terms should be indexed.
     * e.g. there are infinite combinations of numbers in the world.
     * isExceptionImpl should be defined in constructor if needed
     * @param string $term is a string that is to be checked
     * @return bool true if $term is an exception term, false otherwise
     */
    public function isException($term)
    {
        if (isset($this->isExceptionImpl)) {
            return $this->isExceptionImpl($term);
        }
        return false;
    }
    /**
     * Check if the term passed in is a punctuation
     * isPunctuationImpl should be defined in constructor if needed
     * @param string $term is a string that is to be checked
     * @return bool true if $term is a punctuation, false otherwise
     */
    public function isPunctuation($term)
    {
        if (isset($this->isPunctuationImpl)) {
            return $this->isPunctuationImpl($term);
        }
        return false;
    }
    /**
     * Check if all the chars in the term are NOT current language
     * @param string $term is a string that is to be checked
     * @return bool true if all the chars in $term are NOT current language
     *         false otherwise
     */
    public function notCurrentLang($term)
    {
        if (isset($this->isNotCurrentLangImpl)) {
            return $this->isNotCurrentLangImpl($term);
        }
        return false;
    }
    /**
     * Generate a term dictionary file for later segmentation
     * @param mixed $text_files is a string name or an array of files
     *  that are to be trained; words in the files need to be segmented
     *  by space
     * @param string $format currently only supports default and CTB
     * @return bool true if the dictionary file was written
     */
    public function train($text_files, $format = "default")
    {
        $ctb_fmt = false;
        switch ($format) {
            case "default":
                break;
            case "CTB":
                $ctb_fmt = true;
                break;
            default:
                echo "Unrecognized format";
                exit();
        }
        $out_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/term_weight.txt.gz";
        echo "Saving file to: $out_file\n";
        $dictionary = [];
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        foreach ($text_files as $text_file) {
            if (!file_exists($text_file) || is_dir($text_file)) {
                continue;
            }
            $fh = fopen($text_file, "r");
            if ($fh === false) {
                // unreadable file: skip it rather than looping on feof(false)
                continue;
            }
            while (!feof($fh)) {
                $line = fgets($fh);
                if ($line === false) {
                    break;
                }
                // in CTB format, lines that are entirely a <...> tag are markup
                if ($ctb_fmt && preg_match('/^<.*>$/', trim($line))) {
                    continue;
                }
                $words = preg_split("/[\s ]+/u", $line);
                if ($words === false) {
                    continue;
                }
                foreach ($words as $word) {
                    if ($word != "" && !$this->isException($word)
                        && !$this->notCurrentLang($word)) {
                        if (!empty($dictionary[$word])) {
                            $dictionary[$word]++;
                        } else if (mb_strlen($word) < 7) {
                            // only terms shorter than 7 chars are admitted
                            $dictionary[$word] = 1;
                        }
                    }
                }
            }
            fclose($fh);
        }
        $this->dictionary_file = [];
        /* NOTE(review): N ends up as the number of distinct terms, not
           the total number of term occurrences -- confirm this is the
           intended normalizer for getScore(). */
        $this->dictionary_file["N"] = 0;
        $this->dictionary_file["dic"] = [];
        ksort($dictionary);
        /* Terms are grouped by first character; each group's trie is
           JSON encoded as soon as the group is complete so that only one
           decoded trie is held in memory at a time. */
        $start_char = null;
        $tmp_array = [];
        foreach ($dictionary as $key => $value) {
            // numeric looking terms come back from the array as int keys
            $key = (string)$key;
            $first_char = mb_substr($key, 0, 1);
            if ($first_char !== $start_char) {
                if ($start_char !== null) {
                    $this->dictionary_file["dic"][$start_char] =
                        json_encode($tmp_array[$start_char]);
                }
                $tmp_array = [];
                $start_char = $first_char;
            }
            $this->add($key, $value, $tmp_array);
            $this->dictionary_file["N"]++;
        }
        // flush the final group, which no change of first char triggers
        if ($start_char !== null) {
            $this->dictionary_file["dic"][$start_char] =
                json_encode($tmp_array[$start_char]);
        }
        $this->unknown_term_score = $this->getScore(1);
        $written = file_put_contents($out_file,
            gzencode(json_encode($this->dictionary_file), 9));
        return $written !== false;
    }
    /**
     * This function is used to segment a list of files
     * @param mixed $text_files can be a file name or a list of file names
     *        to be segmented
     * @param bool $return_string return segmented string if true,
     *        print to stdout otherwise
     *        user can use > filename to output it to a file
     * @return mixed segmented words separated by spaces if $return_string
     *        is true, otherwise true
     */
    public function segmentFiles($text_files, $return_string = false)
    {
        $result = "";
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        foreach ($text_files as $text_file) {
            $fh = file_exists($text_file) ? fopen($text_file, "r") : false;
            if ($fh === false) {
                echo "cannot open $text_file\n";
                continue;
            }
            while (!feof($fh)) {
                $line = fgets($fh);
                if ($line === false) {
                    break;
                }
                if (mb_strlen($line)) {
                    $t = $this->segmentSentence($line);
                    if ($return_string) {
                        $result .= join(" ", $t) . "\n";
                    } else {
                        echo join(" ", $t) . "\n";
                    }
                }
            }
            fclose($fh);
        }
        if ($return_string) {
            return $result;
        }
        return true;
    }
    /**
     * Segment texts. Words are separated by space
     * @param string $text to be segmented
     * @param bool $return_string return segmented string if true,
     *        print otherwise
     * @return mixed segmented words separated by spaces if $return_string
     *        is true, otherwise true
     */
    public function segmentText($text, $return_string = false)
    {
        $result = "";
        $sentences = explode("\n", $text);
        foreach ($sentences as $line) {
            if (mb_strlen($line)) {
                $t = $this->segmentSentence($line);
                if ($return_string) {
                    $result .= join(" ", $t) . "\n";
                } else {
                    echo join(" ", $t) . "\n";
                }
            }
        }
        if ($return_string) {
            // drop the trailing newline added after the last sentence
            return mb_substr($result, 0, -1);
        }
        return true;
    }
    /**
     * Segment a sentence into arrays of words.
     * Need NOT contain any new line characters.
     * Whitespace separated pieces are segmented independently; each piece
     * is segmented by a lowest-total-score search over character
     * positions, where $score[$i] is the best score of a segmentation
     * ending at character $i and $path[$i] is the index of the last
     * character of the preceding term (-1 for the sentence start).
     * @param string $sentence is a string without newline to be segmented
     * @return array of segmented words, or null if the dictionary file for
     *      the current language does not exist
     */
    public function segmentSentence($sentence)
    {
        $pieces = preg_split("/[\s ]+/u", trim($sentence));
        if ($pieces !== false && count($pieces) > 1) {
            $ret = [];
            foreach ($pieces as $piece) {
                $segments = $this->segmentSentence($piece);
                if ($segments === null) {
                    // dictionary unavailable; propagate instead of merging
                    return null;
                }
                $ret = array_merge($ret, $segments);
            }
            return $ret;
        }
        if (!$this->dictionary_file) {
            $dic_file = C\LOCALE_DIR .
                "/{$this->lang}/resources/term_weight.txt.gz";
            if (!file_exists($dic_file)) {
                crawlLog("$dic_file does not exist!");
                return null;
            }
            $this->dictionary_file =
                json_decode(gzdecode(file_get_contents($dic_file)), true);
            gc_collect_cycles();
            $this->unknown_term_score = $this->getScore(1);
        }
        if ($this->unknown_term_score === null) {
            // dictionary was assigned directly rather than loaded from disk
            $this->unknown_term_score = $this->getScore(1);
        }
        $cache_size = floor(count($this->dictionary_file['dic'])
            * $this->cache_pct);
        if ($cache_size == 0) {
            $cache_size = 1;
        }
        preg_match_all('/./u', trim($sentence), $matches);
        $characters = $matches[0];
        $num_chars = count($characters);
        if (!$num_chars) {
            return [];
        }
        $ner_dict = [];
        if (isset($this->NER)) {
            $named_entities = $this->NER->predict($characters);
            foreach ($named_entities as $e) {
                $this->add($e[0], 1, $ner_dict);
            }
        }
        $score = [];
        $path = [];
        //init base
        $score[-1] = 0;
        $unknown = $this->unknown_term_score;
        for ($index = 0; $index < $num_chars; $index++) {
            $char = $characters[$index];
            /* $score[$index - 1] is always set here: -1 is the base case
               and the single character fallback below sets $score[$index]
               on every iteration. */
            $base = $score[$index - 1];
            //If not current language
            if ($this->notCurrentLang($char) && !$this->isPunctuation($char)) {
                $current_char = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if ($this->notCurrentLang(
                        $current_char . $characters[$j])
                        && !$this->isPunctuation($characters[$j])) {
                        $current_char .= $characters[$j];
                    } else {
                        break;
                    }
                }
                $this->relaxEdge($score, $path, $index, $j - 1,
                    $base + $unknown);
            }
            //If date or number
            if ($this->isException($char)) {
                $current_char = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if (!$this->isException(
                        $current_char . $characters[$j])) {
                        break;
                    }
                    $current_char .= $characters[$j];
                }
                $this->relaxEdge($score, $path, $index, $j - 1,
                    $base + $unknown);
            }
            //If is punctuation, give slightly better score than unknown words
            if ($this->isPunctuation($char)) {
                $current_char = $char;
                for ($j = $index + 1; $j < $num_chars; $j++) {
                    if (!$this->isPunctuation(
                        $current_char . $characters[$j])) {
                        break;
                    }
                    $current_char .= $characters[$j];
                }
                $this->relaxEdge($score, $path, $index, $j - 1,
                    $base + $unknown / 1.1);
            }
            /* All cases (even not in current lang, because the dictionary
               may contain those terms): treat the single char as an
               unknown term so a score exists even if nothing matches
             */
            $this->relaxEdge($score, $path, $index, $index,
                $base + $unknown);
            //if entry exists, look for the term
            if (isset($this->dictionary_file["dic"][$char])) {
                if (!isset($this->cache[$char])) {
                    // newest entry goes first; oldest is popped off the end
                    $this->cache = [$char => json_decode(
                        $this->dictionary_file["dic"][$char], true)]
                        + $this->cache;
                    while (count($this->cache) > $cache_size) {
                        array_pop($this->cache);
                    }
                }
                $this->relaxFromTrie($this->cache, $characters, $index,
                    $score, $path);
            }
            //check NER dictionary
            if (isset($ner_dict[$char])) {
                $this->relaxFromTrie($ner_dict, $characters, $index,
                    $score, $path);
            }
        }
        //trace path
        $t = max(array_keys($path));
        $tmp = [];
        while ($t != -1) {
            $tmp[] = $t;
            $t = $path[$t];
        }
        $result = [];
        $t = 0;
        foreach (array_reverse($tmp) as $next_node) {
            $result_word = "";
            while ($t <= $next_node) {
                $result_word .= $characters[$t];
                $t++;
            }
            $result[] = $result_word;
        }
        return $result;
    }
    /**
     * Records a candidate term covering characters $start..$end if it
     * gives a lower score for position $end than what is stored so far.
     * @param array &$score best score found so far per end position
     * @param array &$path back pointers per end position
     * @param int $start index of the first character of the term
     * @param int $end index of the last character of the term
     * @param float $candidate total score of the segmentation using it
     */
    private function relaxEdge(&$score, &$path, $start, $end, $candidate)
    {
        if (!isset($score[$end]) || $candidate < $score[$end]) {
            $score[$end] = $candidate;
            $path[$end] = $start - 1;
        }
    }
    /**
     * Walks $trie along the characters starting at $index and relaxes an
     * edge for every complete term (node carrying a '$' frequency) found.
     * The candidate that is compared against the stored score is the same
     * value that gets stored, i.e. it uses getScore() of the frequency.
     * @param array $trie trie keyed by character, '$' marks end of term
     * @param array $characters the characters of the sentence
     * @param int $index position in $characters where terms must start
     * @param array &$score best score found so far per end position
     * @param array &$path back pointers per end position
     */
    private function relaxFromTrie($trie, $characters, $index, &$score,
        &$path)
    {
        $base = $score[$index - 1];
        $node = $trie;
        $num_chars = count($characters);
        for ($j = $index; $j < $num_chars; $j++) {
            if (!isset($node[$characters[$j]])) {
                break;
            }
            $node = $node[$characters[$j]];
            if (isset($node['$'])) {
                $this->relaxEdge($score, $path, $index, $j,
                    $base + $this->getScore($node['$']));
            }
        }
    }
    /**
     * This is the function to calculate scores for each word
     * @param int $frequency is an integer giving the frequency of a word
     * @return float the score of the term: -log($frequency / N), or 0 if
     *      the dictionary has no usable N
     */
    private function getScore($frequency)
    {
        if (!empty($this->dictionary_file["N"])
            && is_numeric($this->dictionary_file["N"])) {
            return -log($frequency / $this->dictionary_file["N"]);
        } else {
            return 0;
        }
    }
    /**
     * Adds a term to the dictionary
     *
     * @param string $key the term to be inserted
     * @param string $value the frequency to be inserted
     * @param array &$array trie for insertion
     */
    private function add($key, $value, & $array)
    {
        $trie_array = & $array;
        $length = mb_strlen($key, "utf-8");
        for ($i = 0; $i < $length; $i++) {
            $enc_char = mb_substr($key, $i, 1, "utf-8");
            // If letter doesn't exist then create one by
            // assigning new array
            if (!isset($trie_array[$enc_char])) {
                $trie_array[$enc_char] = [];
            }
            $trie_array = & $trie_array[$enc_char];
        }
        // Set end of term marker
        $trie_array['$'] = $value;
    }
}