<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Xianghong Sun sxh19911230@gmail.com
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\locale\zh_CN\resources as ZH;

/**
 * Machine learning based NER (named entity recognition) tagger.
 * Typically, this class is trained on a tagged corpus via train() and then
 * predicts named-entity tags for a list of terms via predict().
 *
 * @author Xianghong Sun
 */
class ContextWeightedNamedEntityRecognizer
{
    /**
     * Current language tag. Only tested on Simplified Chinese (zh_CN),
     * but might be extensible to other languages in the future.
     * @var string
     */
    public $lang;
    /**
     * Per-character weight features (y = wx + b). Keyed by character (or by
     * the sentinel keys -4..-1 for out-of-sentence positions), then by
     * relative position -2..2, then by tag index. After a non-training
     * load_weight() call, entries hold packed binary strings instead
     * (see pack_w()/getW()). Generated by the train() method.
     * @var array
     */
    public $word_feature;
    /**
     * Previous-tag transition weights (y = wx + b). Keyed by "prevTag" and
     * "prevPrevTag-prevTag" strings ("start"/"start-start" at the sentence
     * beginning), then by tag index. Generated by the train() method.
     * @var array
     */
    public $tag_feature;
    /**
     * Per-tag bias terms (the b of y = wx + b).
     * Generated by the train() method.
     * @var array
     */
    public $bias;
    /**
     * Smallest word-feature weight seen; used to quantize weights to
     * 16-bit ints in pack_w()/getW().
     * @var float
     */
    public $min_w = 0;
    /**
     * Largest word-feature weight seen; used to quantize weights to
     * 16-bit ints in pack_w()/getW().
     * @var float
     */
    public $max_w = 0;
    /**
     * All possible tags. Generated by the train() method.
     * @var array associative array [tag => tag index]
     */
    private $tag_set;
    /**
     * Constructor for the named entity recognizer.
     * To extend to other languages some work is needed; see the Chinese
     * example for what training data should look like.
     * @param string $lang current language; "zh-CH" is normalized to "zh_CN"
     */
    public function __construct($lang)
    {
        switch ($lang) {
            case "zh_CN":
            case "zh-CH":
                $this->lang = "zh_CN";
                break;
            default:
                $this->lang = $lang;
        }
    }
    /**
     * Processes tagged training data files into per-sentence term/tag lists.
     * Each line of a file holds whitespace-separated term/tag pairs; each
     * term is further split into individual characters, which all inherit
     * the term's tag. Lines containing '<' (markup) are skipped.
     * Data format MSRA:
     * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
     * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o
     * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
     * 禁/o 出版物/o 。/o
     * To adapt to other languages, some modifications are needed.
     * @param mixed $text_files a file name or an array of file names
     * @param string $term_tag_splier delimiter between term and tag
     *  (FIX: this parameter was previously ignored; "/" was hard-coded)
     * @param callable $term_process optional preprocess applied to each
     *  character before it is stored
     * @param callable $tag_process optional preprocess applied to each tag
     * @return array of separated sentences; each sentence has the format
     *  [[characters...], [tags...]]
     */
    public static function processTexts($text_files, $term_tag_splier = "/",
        $term_process = null, $tag_process = null)
    {
        $ret = [];
        foreach ($text_files as $text_file) {
            if (!file_exists($text_file)) {
                continue;
            }
            $fn = fopen($text_file, "r");
            while (!feof($fn)) {
                $line = fgets($fn);
                // skip markup lines in the corpus
                if (strpos($line, '<') !== false) {
                    continue;
                }
                $word_tag_pairs = preg_split("/[\s ]+/u", $line);
                if (!count($word_tag_pairs)) {
                    continue;
                }
                $ret[] = [];
                $last = count($ret) - 1;
                $ret[$last][0] = [];
                $ret[$last][1] = [];
                foreach ($word_tag_pairs as $word_tag_pair) {
                    // FIX: honor $term_tag_splier (was hard-coded "/")
                    $t = explode($term_tag_splier, $word_tag_pair);
                    if (count($t) == 2) {
                        $tag = $tag_process ? $tag_process($t[1]) : $t[1];
                        // split the term into individual UTF-8 characters;
                        // FIX: limit must be -1 (null is invalid on PHP 8)
                        foreach (preg_split('//u', $t[0], -1,
                            PREG_SPLIT_NO_EMPTY) as $ch) {
                            $ret[$last][0][] =
                                $term_process ? $term_process($ch) : $ch;
                            $ret[$last][1][] = $tag;
                        }
                    }
                }
            }
            fclose($fn);
        }
        return $ret;
    }
    /**
     * Trains the recognizer on tagged corpus files using averaged-gradient
     * descent on a per-tag logistic (sigmoid) model with character-window,
     * previous-tag, and bias features.
     * Notice: this function might run a very long time, depending on the
     * training set. Weights are checkpointed every 10 epochs and at the end.
     * @param mixed $text_files training data; a file name or array of names
     * @param float $learning_rate step size for gradient descent
     * @param int $max_epoch 1200 might be a good value; the weights tend to
     *  overfit past that point. Training also stops early once the epoch
     *  loss improves by less than 1e-6.
     * @param callable $term_process preprocess applied to terms before
     *  training
     * @param callable $tag_process preprocess applied to tags before
     *  training
     * @return bool true on completion
     */
    public function train($text_files, $learning_rate = 0.1,
        $max_epoch = 1200, $term_process = null, $tag_process = null)
    {
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        echo "Reading files\n";
        // term_tag_sentences[sentence#] = [[words...],[tags...]]
        // FIX: the splitter argument was previously omitted, so
        // $term_process was consumed as the delimiter and $tag_process
        // was silently dropped.
        $term_tag_sentences = self::processTexts($text_files, "/",
            $term_process, $tag_process);
        $this->word_feature = [];
        $this->tag_set = [];
        $tag_index = 0;
        // sentinel keys -4..-1 stand in for positions before the start /
        // after the end of a sentence (see getIndex())
        for ($i = -4; $i <= -1; $i++) {
            $this->word_feature[$i] = [];
        }
        // first pass: discover the tag set and all feature keys
        foreach ($term_tag_sentences as $term_tag_pairs) {
            $terms = $term_tag_pairs[0];
            $tags = $term_tag_pairs[1];
            $this->tag_feature["start"] = [];
            $this->tag_feature["start-start"] = [];
            for ($i = 0; $i < count($terms); $i++) {
                if (!isset($this->tag_set[$tags[$i]])) {
                    $this->tag_set[$tags[$i]] = $tag_index++;
                }
                if ($i == 0) {
                    // position 0 uses the pre-made "start"/"start-start" keys
                } else if ($i == 1) {
                    if (!isset($this->tag_feature["start-" . $tags[$i - 1]])) {
                        $this->tag_feature["start-" . $tags[$i - 1]] = [];
                    }
                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 1]] = [];
                    }
                } else {
                    if (!isset($this->tag_feature[$tags[$i - 2] . "-" .
                        $tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 2] . "-" .
                            $tags[$i - 1]] = [];
                    }
                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 1]] = [];
                    }
                }
                if (!isset($this->word_feature[$terms[$i]])) {
                    $this->word_feature[$terms[$i]] = [];
                }
            }
        }
        // second pass: zero-fill every weight cell
        foreach (array_keys($this->word_feature) as $key) {
            for ($i = -2; $i <= 2; $i++) {
                if (!isset($this->word_feature[$key][$i])) {
                    $this->word_feature[$key][$i] = [];
                }
                foreach ($this->tag_set as $possible_tag => $tag_index) {
                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
                        $this->word_feature[$key][$i][$tag_index] = 0;
                    }
                }
            }
        }
        foreach (array_keys($this->tag_feature) as $key) {
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                if (!isset($this->tag_feature[$key][$tag_index])) {
                    $this->tag_feature[$key][$tag_index] = 0;
                }
            }
        }
        foreach ($this->tag_set as $possible_tag => $tag_index) {
            if (!isset($this->bias[$tag_index])) {
                $this->bias[$tag_index] = 0;
            }
        }
        echo "Training...\n";
        // gradient-descent epochs; stop when loss improvement < 1e-6
        $cross_entropy_loss = 1;
        $pre_cross_entropy_loss = 2;
        for ($epoch = 0; ($epoch < $max_epoch) &&
            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
            $epoch++) {
            $this->min_w = 0;
            $this->max_w = 0;
            $time = time();
            $dy_dw = [];
            $dy_dw_n = [];
            $pre_cross_entropy_loss = $cross_entropy_loss;
            $cross_entropy_loss = 0;
            $cross_entropy_loss_n = 0;
            $dy_db = [];
            $dy_db_n = [];
            $dy_dt = [];
            $dy_dt_n = [];
            for ($i = 0; $i < count($this->tag_set); $i++) {
                $dy_db[$i] = 0;
                $dy_db_n[$i] = 0;
            }
            // accumulate gradients over every character of every sentence
            foreach ($term_tag_sentences as $term_tag_pairs) {
                $terms = $term_tag_pairs[0];
                $tags = $term_tag_pairs[1];
                for ($i = 0; $i < count($terms); $i++) {
                    // resolve feature keys for the 5-character window
                    $k = [];
                    for ($j = -2; $j <= 2; $j++) {
                        $k[$j] = $this->getIndex($i + $j, $terms);
                    }
                    foreach ($this->tag_set as $possible_tag => $tag_index) {
                        // 1/0 target: is this the gold tag?
                        $equality = $possible_tag == $tags[$i] ? 1 : 0;
                        $sum = 0;
                        // 5 characters including the current one
                        for ($j = -2; $j <= 2; $j++) {
                            $sum += $this->word_feature[$k[$j]][$j][$tag_index];
                        }
                        // previous 2 (gold) tags
                        if ($i == 0) {
                            $tf1 = "start";
                            $tf2 = "start-start";
                        } else if ($i == 1) {
                            $tf1 = $tags[$i - 1];
                            $tf2 = "start-" . $tags[$i - 1];
                        } else {
                            $tf1 = $tags[$i - 1];
                            $tf2 = $tags[$i - 2] . "-" . $tags[$i - 1];
                        }
                        $sum += $this->tag_feature[$tf1][$tag_index];
                        $sum += $this->tag_feature[$tf2][$tag_index];
                        // bias
                        $sum += $this->bias[$tag_index];
                        $sigmoid = 1 / (1 + exp(-1 * $sum));
                        // dL/dw for the word-window weights
                        for ($j = -2; $j <= 2; $j++) {
                            if (!isset($dy_dw[$k[$j]])) {
                                $dy_dw[$k[$j]] = [];
                                $dy_dw_n[$k[$j]] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j])) {
                                $dy_dw[$k[$j]][$j] = [];
                                $dy_dw_n[$k[$j]][$j] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                            }
                            $dy_dw[$k[$j]][$j][$tag_index] +=
                                ($sigmoid - $equality);
                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
                        }
                        // dL/dt for the tag-transition weights
                        if (!isset($dy_dt[$tf1])) {
                            $dy_dt[$tf1] = [];
                            $dy_dt_n[$tf1] = [];
                        }
                        if (!isset($dy_dt[$tf1][$tag_index])) {
                            $dy_dt[$tf1][$tag_index] = 0;
                            $dy_dt_n[$tf1][$tag_index] = 0;
                        }
                        if (!isset($dy_dt[$tf2])) {
                            $dy_dt[$tf2] = [];
                            $dy_dt_n[$tf2] = [];
                        }
                        if (!isset($dy_dt[$tf2][$tag_index])) {
                            $dy_dt[$tf2][$tag_index] = 0;
                            $dy_dt_n[$tf2][$tag_index] = 0;
                        }
                        $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf1][$tag_index] += 1;
                        $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf2][$tag_index] += 1;
                        // dL/db for the biases
                        $dy_db[$tag_index] += ($sigmoid - $equality);
                        $dy_db_n[$tag_index] += 1;
                        $cross_entropy_loss +=
                            -$equality * log($sigmoid)
                            - (1 - $equality) * log(1 - $sigmoid);
                        $cross_entropy_loss_n++;
                    }
                }
            }
            $cross_entropy_loss /= $cross_entropy_loss_n;
            $duration = time() - $time;
            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
                " Takes {$duration} seconds\n";
            // apply averaged gradients; track min/max for later quantization
            foreach ($dy_dw as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    foreach ($v2 as $k => $v3) {
                        $this->word_feature[$i][$j][$k] -=
                            $dy_dw[$i][$j][$k] / $dy_dw_n[$i][$j][$k] *
                            $learning_rate;
                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
                            $this->min_w = $this->word_feature[$i][$j][$k];
                        }
                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
                            $this->max_w = $this->word_feature[$i][$j][$k];
                        }
                    }
                }
            }
            foreach ($dy_dt as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    $this->tag_feature[$i][$j] -=
                        $dy_dt[$i][$j] / $dy_dt_n[$i][$j] * $learning_rate;
                }
            }
            foreach ($dy_db as $k => $v) {
                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
            }
            // checkpoint every 10 epochs
            if ($epoch % 10 == 9) {
                $this->save_weight();
            }
        }
        $this->save_weight();
        return true;
    }
    /**
     * The primary function to predict named entities. Tags each term
     * greedily (using previously predicted tags as context), then stitches
     * consecutive identically-tagged terms into entities. Entities of 10 or
     * more characters are discarded as unlikely.
     * @param mixed $sentence an array of segmented terms, or a string which
     *  is split on whitespace (or by $splitter if given)
     * @param string $delimiter glue placed between terms of one entity
     * @param callable $splitter used to split $sentence when it is a string
     *  (FIX: this parameter was previously documented but never used)
     * @return array all predicted named entities with their tags,
     *  e.g. [["郑振铎","nr"],["国民党","nt"]]
     */
    public function predict($sentence, $delimiter = "", $splitter = null)
    {
        if (!is_array($sentence)) {
            if ($sentence == "") {
                $terms = [];
            } else if ($splitter) {
                $terms = $splitter($sentence);
            } else {
                $terms = preg_split("/[\s]+/", $sentence);
            }
        } else {
            $terms = $sentence;
        }
        if (!count($terms)) {
            return [];
        }
        // lazily load packed weights from disk on first use
        if (!$this->word_feature) {
            $this->load_weight();
        }
        $result = [];
        for ($i = 0; $i < count($terms); $i++) {
            $score = [];
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                $score[$possible_tag] = 0;
                // 5-term window around position $i
                for ($j = -2; $j <= 2; $j++) {
                    $k = $this->getIndex($i + $j, $terms);
                    if (isset($this->word_feature[$k])) {
                        $score[$possible_tag] +=
                            $this->getW($k, $j, $tag_index);
                    }
                }
                // previous 2 predicted tags
                if ($i == 0) {
                    $tf1 = "start";
                    $tf2 = "start-start";
                } else if ($i == 1) {
                    $tf1 = $result[$i - 1];
                    $tf2 = "start-" . $result[$i - 1];
                } else {
                    $tf1 = $result[$i - 1];
                    $tf2 = $result[$i - 2] . "-" . $result[$i - 1];
                }
                $score[$possible_tag] += $this->getT($tf1, $tag_index);
                $score[$possible_tag] += $this->getT($tf2, $tag_index);
                $score[$possible_tag] += $this->getB($tag_index);
            }
            $result[] = array_keys($score, max($score))[0];
        }
        // merge runs of identically tagged terms into entities
        $pre_tag = 'o';
        $current_entity = null;
        $ret = [];
        for ($i = 0; $i < count($terms); $i++) {
            if ($pre_tag != $result[$i] && $pre_tag != "o") {
                if (mb_strlen($current_entity) < 10) {
                    $ret[] = [$current_entity, $pre_tag];
                }
                $current_entity = null;
            }
            if ($result[$i] != "o") {
                if ($current_entity) {
                    $current_entity .= $delimiter . $terms[$i];
                } else {
                    $current_entity = $terms[$i];
                }
            }
            $pre_tag = $result[$i];
        }
        // FIX: flush the trailing entity; it was silently dropped when the
        // sentence ended in the middle of a named entity
        if ($pre_tag != "o" && $current_entity !== null &&
            mb_strlen($current_entity) < 10) {
            $ret[] = [$current_entity, $pre_tag];
        }
        return $ret;
    }
    /**
     * Given a sentence ($terms), finds the word_feature key for position
     * $index. In-range positions map to the character itself; positions
     * before the start map to -3/-4 and past the end to -2/-1, the sentinel
     * keys created in train().
     * @param int $index position, possibly out of range
     * @param array $terms the sentence's characters
     * @return mixed character at $index, or a negative sentinel key
     */
    private function getIndex($index, $terms)
    {
        if ($index < 0) {
            $k = $index - 2;
        } else if ($index >= count($terms)) {
            $k = $index - count($terms) - 2;
        } else {
            $k = $terms[$index];
        }
        return $k;
    }
    /**
     * Saves the trained weights (binary-packed and gzip-compressed) to the
     * locale resource directory for the current language.
     */
    private function save_weight()
    {
        $out_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/ner_weight.txt.gz";
        $out = [];
        $out["min_w"] = $this->min_w;
        $out["max_w"] = $this->max_w;
        $out["w"] = [];
        foreach (array_keys($this->word_feature) as $key) {
            $out["w"][$key] = $this->pack_w($key);
        }
        foreach (array_keys($this->tag_feature) as $key) {
            $out["t"][$key] = $this->pack_t($key);
        }
        $out["b"] = $this->pack_b();
        $out["tag_set"] = $this->tag_set;
        echo "Saving...";
        file_put_contents($out_file, gzencode(serialize($out), 9));
        echo " ok\n";
    }
    /**
     * Loads the trained weights from disk. Without $trainning_load the
     * weights stay in packed binary form (as consumed by getW/getT/getB);
     * with it they are unpacked into float arrays for further training.
     * @param bool $trainning_load whether to unpack weights for training
     */
    private function load_weight($trainning_load = false)
    {
        $dic_file = C\LOCALE_DIR .
            "/{$this->lang}/resources/ner_weight.txt.gz";
        if (!file_exists($dic_file)) {
            echo "$dic_file does not exist!";
            exit();
        }
        // allowed_classes => false: never instantiate objects from the file
        $f = unserialize(gzdecode(file_get_contents($dic_file)),
            ['allowed_classes' => false]);
        $this->word_feature = $f["w"];
        $this->tag_feature = $f["t"];
        $this->bias = $f["b"];
        $this->min_w = $f["min_w"];
        $this->max_w = $f["max_w"];
        $this->tag_set = $f["tag_set"];
        if ($trainning_load) {
            foreach (array_keys($this->word_feature) as $key) {
                $this->word_feature[$key] = $this->unpack_w($key);
            }
            foreach (array_keys($this->tag_feature) as $key) {
                $this->tag_feature[$key] = $this->unpack_t($key);
            }
            $this->bias = $this->unpack_b();
        }
    }
    /**
     * Packs the bias array as 32-bit floats.
     * @return string binary representation
     */
    private function pack_b()
    {
        return pack("f*", ...$this->bias);
    }
    /**
     * Unpacks the bias binary string back into a 0-indexed float array.
     * @return array bias per tag index
     */
    private function unpack_b()
    {
        return array_merge(unpack("f" . strval(count($this->tag_set)),
            $this->bias));
    }
    /**
     * Packs one tag_feature row as 32-bit floats.
     * @param string $key tag-transition key
     * @return string binary representation
     */
    private function pack_t($key)
    {
        return pack("f*", ...$this->tag_feature[$key]);
    }
    /**
     * Unpacks one tag_feature row back into a 0-indexed float array.
     * @param string $key tag-transition key
     * @return array weights per tag index
     */
    private function unpack_t($key)
    {
        return array_merge(unpack("f" . strval(count($this->tag_set)),
            $this->tag_feature[$key]));
    }
    /**
     * Packs one word_feature entry, quantizing each weight to an unsigned
     * 16-bit int over the [min_w, max_w] range.
     * @param mixed $key character or sentinel key
     * @return string binary representation
     */
    private function pack_w($key)
    {
        $bin_str = "";
        foreach ($this->word_feature[$key] as $i => $t) {
            foreach ($t as $u) {
                $v = 65535 * ($u - $this->min_w) /
                    ($this->max_w - $this->min_w);
                $bin_str .= pack("S", intval($v));
            }
        }
        return $bin_str;
    }
    /**
     * Unpacks one word_feature entry, de-quantizing the 16-bit ints back to
     * floats in [min_w, max_w], keyed by relative position -2..2.
     * @param mixed $key character or sentinel key
     * @return array [position => [tag index => weight]]
     */
    private function unpack_w($key)
    {
        $tmp = [];
        $size = count($this->tag_set);
        for ($i = 0; $i < 5; $i++) {
            $tmp[$i - 2] = array_merge(unpack("S" . strval($size),
                $this->word_feature[$key], 2 * $i * count($this->tag_set)));
            for ($j = 0; $j < $size; $j++) {
                $tmp[$i - 2][$j] = $tmp[$i - 2][$j] / 65535 *
                    ($this->max_w - $this->min_w) + $this->min_w;
            }
        }
        return $tmp;
    }
    /**
     * Gets the bias value for a tag directly from the packed binary string.
     * @param int $tag_index index of the tag
     * @return float bias value
     */
    private function getB($tag_index)
    {
        return unpack("f", $this->bias, $tag_index * 4)[1];
    }
    /**
     * Gets the tag-transition weight for a tag directly from the packed
     * binary string.
     * @param string $key tag-transition key
     * @param int $tag_index index of the tag
     * @return float weight value
     */
    private function getT($key, $tag_index)
    {
        return unpack("f", $this->tag_feature[$key], $tag_index * 4)[1];
    }
    /**
     * Gets the word-window weight for a term at a relative position for a
     * tag, de-quantizing the packed 16-bit value back to [min_w, max_w].
     * @param mixed $term character or sentinel key
     * @param int $position relative position -2..2
     * @param int $tag_index index of the tag
     * @return float weight value
     */
    private function getW($term, $position, $tag_index)
    {
        return unpack("S", $this->word_feature[$term],
            2 * ($position + 2) * count($this->tag_set) + $tag_index * 2)[1]
            / 65535 * ($this->max_w - $this->min_w) + $this->min_w;
    }
}