<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Xianghong Sun sxh19911230@gmail.com
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\locale\zh_CN\resources as ZH;

/**
 * Machine learning based NER (named entity recognition) tagger.
 * Typically, this class is trained on a tagged corpus via train() and then
 * predicts named-entity tags for a list of terms via predict().
 *
 * @author Xianghong Sun
 */
class ContextWeightedNamedEntityRecognizer
{
    /**
     * Current language tag. Only tested on Simplified Chinese (zh_CN),
     * but might be extensible to other languages in the future.
     * @var string
     */
    public $lang;
    /**
     * Per-character weight features (y = wx + b). Keyed by character (or by
     * the sentinel keys -4..-1 for out-of-sentence positions), then by
     * relative position -2..2, then by tag index. After a non-training
     * load_weight() call, entries hold packed binary strings instead
     * (see pack_w()/getW()). Generated by the train() method.
     * @var array
     */
    public $word_feature;
    /**
     * Previous-tag transition weights (y = wx + b). Keyed by "prevTag" and
     * "prevPrevTag-prevTag" strings ("start"/"start-start" at the sentence
     * beginning), then by tag index. Generated by the train() method.
     * @var array
     */
    public $tag_feature;
    /**
     * Per-tag bias terms (the b of y = wx + b).
     * Generated by the train() method.
     * @var array
     */
    public $bias;
    /**
     * Smallest word-feature weight seen; used to quantize weights to
     * 16-bit ints in pack_w()/getW().
     * @var float
     */
    public $min_w = 0;
    /**
     * Largest word-feature weight seen; used to quantize weights to
     * 16-bit ints in pack_w()/getW().
     * @var float
     */
    public $max_w = 0;
    /**
     * All possible tags. Generated by the train() method.
     * @var array associative array [tag => tag index]
     */
    private $tag_set;
    /**
     * Constructor for the named entity recognizer.
     * To extend to other languages some work is needed; see the Chinese
     * example for what training data should look like.
     * @param string $lang current language; "zh-CH" is normalized to "zh_CN"
     */
    public function __construct($lang)
    {
        switch ($lang) {
            case "zh_CN":
            case "zh-CH":
                $this->lang = "zh_CN";
                break;
            default:
                $this->lang = $lang;
        }
    }
    /**
     * Processes tagged training data files into per-sentence term/tag lists.
     * Each line of a file holds whitespace-separated term/tag pairs; each
     * term is further split into individual characters, which all inherit
     * the term's tag. Lines containing '<' (markup) are skipped.
     * Data format MSRA:
     * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
     * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o
     * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
     * 禁/o 出版物/o 。/o
     * To adapt to other languages, some modifications are needed.
     * @param mixed $text_files a file name or an array of file names
     * @param string $term_tag_splier delimiter between term and tag
     *  (FIX: this parameter was previously ignored; "/" was hard-coded)
     * @param callable $term_process optional preprocess applied to each
     *  character before it is stored
     * @param callable $tag_process optional preprocess applied to each tag
     * @return array of separated sentences; each sentence has the format
     *  [[characters...], [tags...]]
     */
    public static function processTexts($text_files, $term_tag_splier = "/",
        $term_process = null, $tag_process = null)
    {
        $ret = [];
        foreach ($text_files as $text_file) {
            if (!file_exists($text_file)) {
                continue;
            }
            $fn = fopen($text_file, "r");
            while (!feof($fn)) {
                $line = fgets($fn);
                // skip markup lines in the corpus
                if (strpos($line, '<') !== false) {
                    continue;
                }
                $word_tag_pairs = preg_split("/[\s ]+/u", $line);
                if (!count($word_tag_pairs)) {
                    continue;
                }
                $ret[] = [];
                $last = count($ret) - 1;
                $ret[$last][0] = [];
                $ret[$last][1] = [];
                foreach ($word_tag_pairs as $word_tag_pair) {
                    // FIX: honor $term_tag_splier (was hard-coded "/")
                    $t = explode($term_tag_splier, $word_tag_pair);
                    if (count($t) == 2) {
                        $tag = $tag_process ? $tag_process($t[1]) : $t[1];
                        // split the term into individual UTF-8 characters;
                        // FIX: limit must be -1 (null is invalid on PHP 8)
                        foreach (preg_split('//u', $t[0], -1,
                            PREG_SPLIT_NO_EMPTY) as $ch) {
                            $ret[$last][0][] =
                                $term_process ? $term_process($ch) : $ch;
                            $ret[$last][1][] = $tag;
                        }
                    }
                }
            }
            fclose($fn);
        }
        return $ret;
    }
    /**
     * Trains the recognizer on tagged corpus files using averaged-gradient
     * descent on a per-tag logistic (sigmoid) model with character-window,
     * previous-tag, and bias features.
     * Notice: this function might run a very long time, depending on the
     * training set. Weights are checkpointed every 10 epochs and at the end.
     * @param mixed $text_files training data; a file name or array of names
     * @param float $learning_rate step size for gradient descent
     * @param int $max_epoch 1200 might be a good value; the weights tend to
     *  overfit past that point. Training also stops early once the epoch
     *  loss improves by less than 1e-6.
     * @param callable $term_process preprocess applied to terms before
     *  training
     * @param callable $tag_process preprocess applied to tags before
     *  training
     * @return bool true on completion
     */
    public function train($text_files, $learning_rate = 0.1,
        $max_epoch = 1200, $term_process = null, $tag_process = null)
    {
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        echo "Reading files\n";
        // term_tag_sentences[sentence#] = [[words...],[tags...]]
        // FIX: the splitter argument was previously omitted, so
        // $term_process was consumed as the delimiter and $tag_process
        // was silently dropped.
        $term_tag_sentences = self::processTexts($text_files, "/",
            $term_process, $tag_process);
        $this->word_feature = [];
        $this->tag_set = [];
        $tag_index = 0;
        // sentinel keys -4..-1 stand in for positions before the start /
        // after the end of a sentence (see getIndex())
        for ($i = -4; $i <= -1; $i++) {
            $this->word_feature[$i] = [];
        }
        // first pass: discover the tag set and all feature keys
        foreach ($term_tag_sentences as $term_tag_pairs) {
            $terms = $term_tag_pairs[0];
            $tags = $term_tag_pairs[1];
            $this->tag_feature["start"] = [];
            $this->tag_feature["start-start"] = [];
            for ($i = 0; $i < count($terms); $i++) {
                if (!isset($this->tag_set[$tags[$i]])) {
                    $this->tag_set[$tags[$i]] = $tag_index++;
                }
                if ($i == 0) {
                    // position 0 uses the pre-made "start"/"start-start" keys
                } else if ($i == 1) {
                    if (!isset($this->tag_feature["start-" . $tags[$i - 1]])) {
                        $this->tag_feature["start-" . $tags[$i - 1]] = [];
                    }
                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 1]] = [];
                    }
                } else {
                    if (!isset($this->tag_feature[$tags[$i - 2] . "-" .
                        $tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 2] . "-" .
                            $tags[$i - 1]] = [];
                    }
                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 1]] = [];
                    }
                }
                if (!isset($this->word_feature[$terms[$i]])) {
                    $this->word_feature[$terms[$i]] = [];
                }
            }
        }
        // second pass: zero-fill every weight cell
        foreach (array_keys($this->word_feature) as $key) {
            for ($i = -2; $i <= 2; $i++) {
                if (!isset($this->word_feature[$key][$i])) {
                    $this->word_feature[$key][$i] = [];
                }
                foreach ($this->tag_set as $possible_tag => $tag_index) {
                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
                        $this->word_feature[$key][$i][$tag_index] = 0;
                    }
                }
            }
        }
        foreach (array_keys($this->tag_feature) as $key) {
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                if (!isset($this->tag_feature[$key][$tag_index])) {
                    $this->tag_feature[$key][$tag_index] = 0;
                }
            }
        }
        foreach ($this->tag_set as $possible_tag => $tag_index) {
            if (!isset($this->bias[$tag_index])) {
                $this->bias[$tag_index] = 0;
            }
        }
        echo "Training...\n";
        // gradient-descent epochs; stop when loss improvement < 1e-6
        $cross_entropy_loss = 1;
        $pre_cross_entropy_loss = 2;
        for ($epoch = 0; ($epoch < $max_epoch) &&
            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
            $epoch++) {
            $this->min_w = 0;
            $this->max_w = 0;
            $time = time();
            $dy_dw = [];
            $dy_dw_n = [];
            $pre_cross_entropy_loss = $cross_entropy_loss;
            $cross_entropy_loss = 0;
            $cross_entropy_loss_n = 0;
            $dy_db = [];
            $dy_db_n = [];
            $dy_dt = [];
            $dy_dt_n = [];
            for ($i = 0; $i < count($this->tag_set); $i++) {
                $dy_db[$i] = 0;
                $dy_db_n[$i] = 0;
            }
            // accumulate gradients over every character of every sentence
            foreach ($term_tag_sentences as $term_tag_pairs) {
                $terms = $term_tag_pairs[0];
                $tags = $term_tag_pairs[1];
                for ($i = 0; $i < count($terms); $i++) {
                    // resolve feature keys for the 5-character window
                    $k = [];
                    for ($j = -2; $j <= 2; $j++) {
                        $k[$j] = $this->getIndex($i + $j, $terms);
                    }
                    foreach ($this->tag_set as $possible_tag => $tag_index) {
                        // 1/0 target: is this the gold tag?
                        $equality = $possible_tag == $tags[$i] ? 1 : 0;
                        $sum = 0;
                        // 5 characters including the current one
                        for ($j = -2; $j <= 2; $j++) {
                            $sum += $this->word_feature[$k[$j]][$j][$tag_index];
                        }
                        // previous 2 (gold) tags
                        if ($i == 0) {
                            $tf1 = "start";
                            $tf2 = "start-start";
                        } else if ($i == 1) {
                            $tf1 = $tags[$i - 1];
                            $tf2 = "start-" . $tags[$i - 1];
                        } else {
                            $tf1 = $tags[$i - 1];
                            $tf2 = $tags[$i - 2] . "-" . $tags[$i - 1];
                        }
                        $sum += $this->tag_feature[$tf1][$tag_index];
                        $sum += $this->tag_feature[$tf2][$tag_index];
                        // bias
                        $sum += $this->bias[$tag_index];
                        $sigmoid = 1 / (1 + exp(-1 * $sum));
                        // dL/dw for the word-window weights
                        for ($j = -2; $j <= 2; $j++) {
                            if (!isset($dy_dw[$k[$j]])) {
                                $dy_dw[$k[$j]] = [];
                                $dy_dw_n[$k[$j]] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j])) {
                                $dy_dw[$k[$j]][$j] = [];
                                $dy_dw_n[$k[$j]][$j] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                            }
                            $dy_dw[$k[$j]][$j][$tag_index] +=
                                ($sigmoid - $equality);
                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
                        }
                        // dL/dt for the tag-transition weights
                        if (!isset($dy_dt[$tf1])) {
                            $dy_dt[$tf1] = [];
                            $dy_dt_n[$tf1] = [];
                        }
                        if (!isset($dy_dt[$tf1][$tag_index])) {
                            $dy_dt[$tf1][$tag_index] = 0;
                            $dy_dt_n[$tf1][$tag_index] = 0;
                        }
                        if (!isset($dy_dt[$tf2])) {
                            $dy_dt[$tf2] = [];
                            $dy_dt_n[$tf2] = [];
                        }
                        if (!isset($dy_dt[$tf2][$tag_index])) {
                            $dy_dt[$tf2][$tag_index] = 0;
                            $dy_dt_n[$tf2][$tag_index] = 0;
                        }
                        $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf1][$tag_index] += 1;
                        $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf2][$tag_index] += 1;
                        // dL/db for the biases
                        $dy_db[$tag_index] += ($sigmoid - $equality);
                        $dy_db_n[$tag_index] += 1;
                        $cross_entropy_loss +=
                            -$equality * log($sigmoid)
                            - (1 - $equality) * log(1 - $sigmoid);
                        $cross_entropy_loss_n++;
                    }
                }
            }
            $cross_entropy_loss /= $cross_entropy_loss_n;
            $duration = time() - $time;
            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
                " Takes {$duration} seconds\n";
            // apply averaged gradients; track min/max for later quantization
            foreach ($dy_dw as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    foreach ($v2 as $k => $v3) {
                        $this->word_feature[$i][$j][$k] -=
                            $dy_dw[$i][$j][$k] / $dy_dw_n[$i][$j][$k] *
                            $learning_rate;
                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
                            $this->min_w = $this->word_feature[$i][$j][$k];
                        }
                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
                            $this->max_w = $this->word_feature[$i][$j][$k];
                        }
                    }
                }
            }
            foreach ($dy_dt as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    $this->tag_feature[$i][$j] -=
                        $dy_dt[$i][$j] / $dy_dt_n[$i][$j] * $learning_rate;
                }
            }
            foreach ($dy_db as $k => $v) {
                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
            }
            // checkpoint every 10 epochs
            if ($epoch % 10 == 9) {
                $this->save_weight();
            }
        }
        $this->save_weight();
        return true;
    }
    /**
     * The primary function to predict named entities. Tags each term
     * greedily (using previously predicted tags as context), then stitches
     * consecutive identically-tagged terms into entities. Entities of 10 or
     * more characters are discarded as unlikely.
     * @param mixed $sentence an array of segmented terms, or a string which
     *  is split on whitespace (or by $splitter if given)
     * @param string $delimiter glue placed between terms of one entity
     * @param callable $splitter used to split $sentence when it is a string
     *  (FIX: this parameter was previously documented but never used)
     * @return array all predicted named entities with their tags,
     *  e.g. [["郑振铎","nr"],["国民党","nt"]]
     */
    public function predict($sentence, $delimiter = "", $splitter = null)
    {
        if (!is_array($sentence)) {
            if ($sentence == "") {
                $terms = [];
            } else if ($splitter) {
                $terms = $splitter($sentence);
            } else {
                $terms = preg_split("/[\s]+/", $sentence);
            }
        } else {
            $terms = $sentence;
        }
        if (!count($terms)) {
            return [];
        }
        // lazily load packed weights from disk on first use
        if (!$this->word_feature) {
            $this->load_weight();
        }
        $result = [];
        for ($i = 0; $i < count($terms); $i++) {
            $score = [];
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                $score[$possible_tag] = 0;
                // 5-term window around position $i
                for ($j = -2; $j <= 2; $j++) {
                    $k = $this->getIndex($i + $j, $terms);
                    if (isset($this->word_feature[$k])) {
                        $score[$possible_tag] +=
                            $this->getW($k, $j, $tag_index);
                    }
                }
                // previous 2 predicted tags
                if ($i == 0) {
                    $tf1 = "start";
                    $tf2 = "start-start";
                } else if ($i == 1) {
                    $tf1 = $result[$i - 1];
                    $tf2 = "start-" . $result[$i - 1];
                } else {
                    $tf1 = $result[$i - 1];
                    $tf2 = $result[$i - 2] . "-" . $result[$i - 1];
                }
                $score[$possible_tag] += $this->getT($tf1, $tag_index);
                $score[$possible_tag] += $this->getT($tf2, $tag_index);
                $score[$possible_tag] += $this->getB($tag_index);
            }
            $result[] = array_keys($score, max($score))[0];
        }
        // merge runs of identically tagged terms into entities
        $pre_tag = 'o';
        $current_entity = null;
        $ret = [];
        for ($i = 0; $i < count($terms); $i++) {
            if ($pre_tag != $result[$i] && $pre_tag != "o") {
                if (mb_strlen($current_entity) < 10) {
                    $ret[] = [$current_entity, $pre_tag];
                }
                $current_entity = null;
            }
            if ($result[$i] != "o") {
                if ($current_entity) {
                    $current_entity .= $delimiter . $terms[$i];
                } else {
                    $current_entity = $terms[$i];
                }
            }
            $pre_tag = $result[$i];
        }
        // FIX: flush the trailing entity; it was silently dropped when the
        // sentence ended in the middle of a named entity
        if ($pre_tag != "o" && $current_entity !== null &&
            mb_strlen($current_entity) < 10) {
            $ret[] = [$current_entity, $pre_tag];
        }
        return $ret;
    }
    /**
     * Given a sentence ($terms), finds the word_feature key for position
     * $index. In-range positions map to the character itself; positions
     * before the start map to -3/-4 and past the end to -2/-1, the sentinel
     * keys created in train().
     * @param int $index position, possibly out of range
     * @param array $terms the sentence's characters
     * @return mixed character at $index, or a negative sentinel key
     */
    private function getIndex($index, $terms)
    {
        if ($index < 0) {
            $k = $index - 2;
        } else if ($index >= count($terms)) {
            $k = $index - count($terms) - 2;
        } else {
            $k = $terms[$index];
        }
        return $k;
    }
    /**
     * Saves the trained weights (binary-packed and gzip-compressed) to the
     * locale resource directory for the current language.
     */
    private function save_weight()
    {
        $out_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/ner_weight.txt.gz";
        $out = [];
        $out["min_w"] = $this->min_w;
        $out["max_w"] = $this->max_w;
        $out["w"] = [];
        foreach (array_keys($this->word_feature) as $key) {
            $out["w"][$key] = $this->pack_w($key);
        }
        foreach (array_keys($this->tag_feature) as $key) {
            $out["t"][$key] = $this->pack_t($key);
        }
        $out["b"] = $this->pack_b();
        $out["tag_set"] = $this->tag_set;
        echo "Saving...";
        file_put_contents($out_file, gzencode(serialize($out), 9));
        echo " ok\n";
    }
    /**
     * Loads the trained weights from disk. Without $trainning_load the
     * weights stay in packed binary form (as consumed by getW/getT/getB);
     * with it they are unpacked into float arrays for further training.
     * @param bool $trainning_load whether to unpack weights for training
     */
    private function load_weight($trainning_load = false)
    {
        $dic_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/ner_weight.txt.gz";
        if (!file_exists($dic_file)) {
            echo "$dic_file does not exist!";
            exit();
        }
        // allowed_classes => false: never instantiate objects from the file
        $f = unserialize(gzdecode(file_get_contents($dic_file)),
            ['allowed_classes' => false]);
        $this->word_feature = $f["w"];
        $this->tag_feature = $f["t"];
        $this->bias = $f["b"];
        $this->min_w = $f["min_w"];
        $this->max_w = $f["max_w"];
        $this->tag_set = $f["tag_set"];
        if ($trainning_load) {
            foreach (array_keys($this->word_feature) as $key) {
                $this->word_feature[$key] = $this->unpack_w($key);
            }
            foreach (array_keys($this->tag_feature) as $key) {
                $this->tag_feature[$key] = $this->unpack_t($key);
            }
            $this->bias = $this->unpack_b();
        }
    }
    /**
     * Packs the bias array as 32-bit floats.
     * @return string binary representation
     */
    private function pack_b()
    {
        return pack("f*", ...$this->bias);
    }
    /**
     * Unpacks the bias binary string back into a 0-indexed float array.
     * @return array bias per tag index
     */
    private function unpack_b()
    {
        return array_merge(unpack("f" . strval(count($this->tag_set)),
            $this->bias));
    }
    /**
     * Packs one tag_feature row as 32-bit floats.
     * @param string $key tag-transition key
     * @return string binary representation
     */
    private function pack_t($key)
    {
        return pack("f*", ...$this->tag_feature[$key]);
    }
    /**
     * Unpacks one tag_feature row back into a 0-indexed float array.
     * @param string $key tag-transition key
     * @return array weights per tag index
     */
    private function unpack_t($key)
    {
        return array_merge(unpack("f" . strval(count($this->tag_set)),
            $this->tag_feature[$key]));
    }
    /**
     * Packs one word_feature entry, quantizing each weight to an unsigned
     * 16-bit int over the [min_w, max_w] range.
     * @param mixed $key character or sentinel key
     * @return string binary representation
     */
    private function pack_w($key)
    {
        $bin_str = "";
        foreach ($this->word_feature[$key] as $i => $t) {
            foreach ($t as $u) {
                $v = 65535 * ($u - $this->min_w) /
                    ($this->max_w - $this->min_w);
                $bin_str .= pack("S", intval($v));
            }
        }
        return $bin_str;
    }
    /**
     * Unpacks one word_feature entry, de-quantizing the 16-bit ints back to
     * floats in [min_w, max_w], keyed by relative position -2..2.
     * @param mixed $key character or sentinel key
     * @return array [position => [tag index => weight]]
     */
    private function unpack_w($key)
    {
        $tmp = [];
        $size = count($this->tag_set);
        for ($i = 0; $i < 5; $i++) {
            $tmp[$i - 2] = array_merge(unpack("S" . strval($size),
                $this->word_feature[$key], 2 * $i * count($this->tag_set)));
            for ($j = 0; $j < $size; $j++) {
                $tmp[$i - 2][$j] = $tmp[$i - 2][$j] / 65535 *
                    ($this->max_w - $this->min_w) + $this->min_w;
            }
        }
        return $tmp;
    }
    /**
     * Gets the bias value for a tag directly from the packed binary string.
     * @param int $tag_index index of the tag
     * @return float bias value
     */
    private function getB($tag_index)
    {
        return unpack("f", $this->bias, $tag_index * 4)[1];
    }
    /**
     * Gets the tag-transition weight for a tag directly from the packed
     * binary string.
     * @param string $key tag-transition key
     * @param int $tag_index index of the tag
     * @return float weight value
     */
    private function getT($key, $tag_index)
    {
        return unpack("f", $this->tag_feature[$key], $tag_index * 4)[1];
    }
    /**
     * Gets the word-window weight for a term at a relative position for a
     * tag, de-quantizing the packed 16-bit value back to [min_w, max_w].
     * @param mixed $term character or sentinel key
     * @param int $position relative position -2..2
     * @param int $tag_index index of the tag
     * @return float weight value
     */
    private function getW($term, $position, $tag_index)
    {
        return unpack("S", $this->word_feature[$term],
            2 * ($position + 2) * count($this->tag_set) + $tag_index * 2)[1]
            / 65535 * ($this->max_w - $this->min_w) + $this->min_w;
    }
}