Last commit for src/locale/zh_CN/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\locale\zh_CN\resources;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\PhraseParser;

/**
 * Chinese specific tokenization code. Typically, tokenizer.php
 * either contains a stemmer for the language in question or
 * it specifies how many characters in a char gram
 *
 * @author Chris Pollett
 */

class Tokenizer
{
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. This is also used
     * for language detection
     * @var array
     */
    public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去',
        '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你',
        '的', '是', '了', '他', '么', '们', '在', '说', '为', '好', '吧', '知道',
        '我的', '和', '你的', '想', '只', '很', '都', '对', '把', '啊', '怎', '得',
        '还', '过', '不是', '到', '样', '飞', '远', '身', '任何', '生活', '够',
        '号', '兰', '瑞', '达', '或', '愿', '蒂', '別', '军', '正', '是不是',
        '证', '不用', '三', '乐', '吉', '男人', '告訴', '路', '搞', '可是',
        '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
        '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
        '女孩', '世界'];
    /**
     * regular expression to determine if the None of the char in this
     * term is in current language.
     * @var string
     */
    public static $non_char_preg = "/^[^\p{Han}]+$/u";
    /**
     * The dictionary of characters can be used as Chinese Numbers
     * @var string
     */
    public static $num_dict =
       "1234567890○〇零一二两三四五六七八九十百千万亿".
       "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
    /**
     * Dots used in Chinese Numbers
     * @var string
     */
    public static $dot = "\..点";
    /**
     * A list of characters can be used at the end of numbers
     * @var string
     */
    public static $num_end = "%%";
    /**
     * Exception words of the regex found by functions:
     * isCardinalNumber, isOrdinalNumber, isDate
     * ex. "十分" in most of time means "very", but it will
     * be determined to be "10 minutes" by the function so we
     * need to remove it
     * @var array of string
     */
    public static $exception_list= ["十分","一","一点","千万",
    "万一", "一一", "拾", "一时", "千千", "万万", "陆"];
    /**
     * A list of characters can be used as Chinese punctuations
     * @var string
     */
    public static $punctuation_preg =
    "/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
    "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
    "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
    "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
    /**
     * Any unique identifier corresponding to the component of a triplet which
     * can be answered using a question answer list
     * @var string
     */
    public static $question_token = "qqq";
    /**
     * Words array that determine if a sentence passed in is a question
     * @var array
     */
    public static $question_words = [
        "any" => ["谁" => "who",
                "哪儿|哪里" => "where",
                "哪个" => "which",
                "哪些" => "list",
                "哪" => ["after" => [   "1|一"=>"which",
                                    "[2-9]|[1-9][0-9]+"=>"list"
                                ],
                       "other"=>"where"
                        ],
                "什么|啥|咋" => [ "after" => [    "地方"=>"where",
                                            "地点"=>"where",
                                            "时\w*"=>"when"
                                       ],
                                "other" => "what"],
                "怎么|怎样|怎么样|如何" => "how",
                "为什么" => "why",
                "多少" => "how many",
                "几\w*" => ["any" => ["吗|\?|?" => "how many"],
                    "other" => false],
                "多久" => "how long",
                "多大" => "how big"
                ],
        "other" => [  "any" => [    "吗"=>"yesno",
                                "呢" => "what about"
                           ],
                    "other" => [  "other" => false,
                                "any" => ["\?|?" => "yesno"]
                             ]
                 ]
        ];
    /**
     * List of adjective-like parts of speech that might appear in lexicon file
     * Predicative adjective: VA
     * other noun-modifier: JJ
     * @var array
     */
    public static $adjective_type = ["VA", "JJ"];
    /**
     * List of adverb-like parts of speech that might appear in lexicon file
     * @var array
     */
    public static $adverb_type = ["AD"];
    /**
     * List of conjunction-like parts of speech that might appear in lexicon
     * file
     * Coordinating conjunction: CC
     * Subordinating conjunction: CS
     * @var array
     */
    public static $conjunction_type = ["CC", "CS"];
    /**
     * List of determiner-like parts of speech that might appear in lexicon
     * file
     * Determiner: DT
     * Cardinal Number: CD
     * Ordinal Number: OD
     * Measure word: M
     * @array
     */
    public static $determiner_type = ["DT", "CD", "OD", "M"];
    /**
     * List of noun-like parts of speech that might appear in lexicon file
     * Proper Noun: NR
     * Temporal Noun: NT
     * Other Noun: NN
     * Pronoun: PN
     * @var array
     */
    public static $noun_type = ["NR", "NT", "NN", "PN"];
    /**
     * List of verb-like parts of speech that might appear in lexicon file
     * Copula: VC
     * you3 as the main verb: VE
     * Other verb: VV
     * Short passive voice: SB
     * Long passive voice: LB
     * @var array
     */
    public static $verb_type = ["VC", "VE", "VV", "SB", "LB"];
    /**
     * List of particle-like parts of speech that might appear in lexicon file
     * No meaning words that can appear anywhere
     * @var array
     */
    public static $particle_type = [
        "AS", "ETC", "DEC", "DEG", "DEV", "MSP",
        "DER", "SP", "IJ", "FW"];
    /**
     * StochasticTermSegmenter instance used for segmenting chines
     * @var object
     */
    private static $stochastic_term_segmenter;
    /**
     * Named Entity tagger instance used to recognizer noun entities in
     * Chinese text
     * @var object
     */
    private static $named_entity_tagger;
    /**
     * PartOfSpeechContextTagger instance used in adding part of speech
     * annotations to Chinese text
     * @var object
     */
    private static $pos_tagger;
    /**
     * Holds a associative array with keys which are traditional characters
     * and values their simplified character correspondents.
     * @var array
     */
    private static $traditional_simplified_map;
    /**
     * Removes the stop words from the page (used for Word Cloud generation
     * and language detection)
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words
     */
    public static function stopwordsRemover($data)
    {
        static $pattern = "";
        if (empty($pattern)) {
            $pattern = '/(' . implode('|', self::$stop_words) . ')/u';
        }
        $data = self::normalize($data);
        $data = preg_replace($pattern, '', $data);
        return $data;
    }
    /**
     * A word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment  before segmentation
     * @param string $method  indicates which method to use
     * @return string with words separated by space
     */
    public static function segment($pre_segment, $method = "STS")
    {
        switch($method)
        {
            case "RMM":
                return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN",
                ['/^\d+$/', '/^[a-zA-Z]+$/']);
            case "STS":
                return self::getStochasticTermSegmenter()
                    ->segmentText($pre_segment);
        }
    }
    /**
     * Check if the term passed in is a Cardinal Number
     * @param string $term to check if a cardinal number or not
     * @return bool whether it is a cardinal or not
     */
    public static function isCardinalNumber($term)
    {
        return !in_array($term,self::$exception_list)
            && preg_match("/^([" . self::$num_dict .
            "]+([" . self::$dot . "][" .self::$num_dict .
            "]+)?[" . self::$num_end .
            "]?[余餘多]?[百千万亿佰仟萬億]*)".
            "$|^([".self::$num_dict."]+分之[" . self::$num_dict .
            "]+([" . self::$dot . "][" .self::$num_dict .
            "]+)?)$/u", $term);
    }
    /*
     * Check if the term passed in is a Ordinal Number
     * @param string $term to check if a ordinal number or not
     * @return bool whether it is a ordinal or not
     */
    public static function isOrdinalNumber($term)
    {
        return !in_array($term,self::$exception_list)
            && preg_match("/^第[" . self::$num_dict .
            "]*$/u", $term);
    }
    /*
     * Check if the term passed in is a date
     * @param string $term to check if a date or not
     * @return bool whether it is a date or not
     */
    public static function isDate($term)
    {
        return !in_array($term,self::$exception_list)
            && preg_match("/^[" . self::$num_dict .
            "]+(年|年代|月|日|时|小时|時|小時|" .
            "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
    }
    /**
     * Check if the term is a punctuation
     */
    public static function isPunctuation($term)
    {
        return preg_match(self::$punctuation_preg, $term);
    }
    /**
     * Check if all the chars in the term is NOT current language
     * @param string $term is a string that to be checked
     * @return bool true if all the chars in $term is NOT current language
     *         false otherwise
     */
    public static function isNotCurrentLang($term)
    {
        return preg_match(self::$non_char_preg, $term);
    }
    /**
     * Get the segmenter instance, instantiating it if necessary
     * @return StochasticTermSegmenter
     */
    public static function getStochasticTermSegmenter()
    {
        if (!self::$stochastic_term_segmenter) {
            self::$stochastic_term_segmenter
                = new L\StochasticTermSegmenter("zh-CN");
        }
        return self::$stochastic_term_segmenter;
    }
    /**
     * Determines the part of speech tag of a term using simple rules if
     * possible
     * @param string $term to see if can get a part of speech for via a rule
     * @return string part of speech tag or $term if can't be determine
     */
    public static function getPosKey($term)
    {
        if (self::isPunctuation($term)) {
            return 'PU';
        } else if (self::isCardinalNumber($term)) {
            return 'CD';
        } else if (self::isOrdinalNumber($term)) {
            return 'OD';
        } else if (self::isDate($term)) {
            return 'NT';
        } else if (self::isNotCurrentLang($term)) {
            return 'FW';
        }
        return $term;
    }
    /**
     * Possible tags a term can have that can be determined by a simple rule
     * @return array
     */
    public static function getPosKeyList()
    {
        return ['PU','CD','OD','NT','FW'];
    }
    /**
     * Return list of possible tags that an unknown term can have
     * @return array
     */
    public static function getPosUnknownTagsList()
    {
        return ["NN","NR","VV","VA"];
    }
    /**
     * Get the named entity tagger instance
     * @return NamedEntityContextTagger for Chinese
     */
    public static function getNamedEntityTagger()
    {
        if (!self::$named_entity_tagger) {
            self::$named_entity_tagger
                = new L\NamedEntityContextTagger("zh-CN");
        }
        return self::$named_entity_tagger;
    }
    /**
     * Get Part of Speec instance
     * @return PartOfSpeechContextTagger for Chinese
     */
    public static function getPosTagger()
    {
        if (!self::$pos_tagger) {
            self::$pos_tagger
                = new L\PartOfSpeechContextTagger("zh-CN");
        }
        return self::$pos_tagger;
    }
    /**
     * Converts traditional Chinese characters to simplified characters
     * @param  string $text is a string of Chinese Char
     * @return string normalized form of the text
     */
    public static function normalize($text)
    {
        if (empty(self::$traditional_simplified_map)) {
            $path = C\LOCALE_DIR .
                "/zh_CN/resources/traditional_simplified.txt.gz";
            if (!file_exists($path)) {
                return $text;
            }
            self::$traditional_simplified_map =
                unserialize(gzdecode(file_get_contents($path)));
        }
        if (is_string($text)) {
            if (!($chars = preg_split('//u', $text, -1,
                PREG_SPLIT_NO_EMPTY))) {
                return $text;
            }
        } else {
            return $text;
        }
        $num_chars = count($chars);
        for($i = 0; $i < $num_chars; $i++) {
            if (isset(self::$traditional_simplified_map[$chars[$i]])) {
                $chars[$i] = self::$traditional_simplified_map[$chars[$i]];
            }
        }
        return implode($chars);
    }
    /**
     * Scans a word list for phrases. For phrases found generate
     * a list of question and answer pairs at two levels of granularity:
     * CONCISE (using all terms in original phrase) and RAW (removing
     * (adjectives, etc).
     *
     * @param array $word_and_phrase_list of statements
     * @return array with two fields: QUESTION_LIST consisting of triplets
     *      (SUBJECT, PREDICATES, OBJECT) where one of the components has been
     *      replaced with a question marker.
     */
    public static function extractTripletsPhrases($word_and_phrase_list)
    {
        $triplets_list = [];
        $question_list = [];
        $question_answer_list = [];
        $triplet_types = ['CONCISE', 'RAW'];
        foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
            // strip parentheticals
            $word_and_phrase = preg_replace(
                "/[\{\[\(【(][^\}\])】\)]+[\}\]\))】]/u",
                "", $word_and_phrase);
            $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
            $parse_tree = ['cur_node' => 0];
            $extracted_triplets_set = [];
            $extracted_triplets=[];
            $pre_sub = [];
            while ($parse_tree['cur_node'] < count($tagged_phrase)) {
                $parse_tree = self::parseWholePhrase($tagged_phrase,
                ['cur_node'=>$parse_tree['cur_node']],$pre_sub);
                $triplets = self::extractTripletsParseTree($parse_tree);
                if (isset($parse_tree['NP'])) {
                    $pre_sub = $parse_tree['NP'];
                }
                $extracted_triplets_set[] = self::rearrangeTripletsByType(
                    $triplets);
                // next partial sentence
                while($parse_tree['cur_node'] < count($tagged_phrase)
                    && $tagged_phrase[$parse_tree['cur_node']]["tag"] != "PU") {
                    $parse_tree['cur_node']++;
                }
                $parse_tree['cur_node']++;
            }
            foreach($extracted_triplets_set as $extracted_triplets) {
                foreach ($triplet_types as $type) {
                    if (!empty($extracted_triplets[$type])) {
                        $triplets = $extracted_triplets[$type];
                        $questions = $triplets['QUESTION_LIST'];
                        foreach ($questions as $question) {
                            $question_list[$question] = $position_list;
                        }
                        $question_answer_list = array_merge(
                            $question_answer_list,
                            $triplets['QUESTION_ANSWER_LIST']);
                    }
                }
            }
        }
        $out_triplets['QUESTION_LIST'] = $question_list;
        $out_triplets['QUESTION_ANSWER_LIST'] = $question_answer_list;
        return $out_triplets;
    }
    /**
     * Split input text into terms and output an array with one element
     * per term, that element consisting of array with the term token
     * and the part of speech tag.
     *
     * @param string $text string to tag and tokenize
     * @return array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term) for one each token in $text
     */
    public static function tagTokenizePartOfSpeech($text)
    {
        $segmented = self::getStochasticTermSegmenter()->segmentSentence($text);
        $tags = self::getPosTagger()->predict($segmented);
        $result=[];
        for($i = 0; $i < count($segmented); $i++) {
            $result[$i] = [];
            $result[$i]["token"] = $segmented[$i];
            $result[$i]["tag"] = $tags[$i];
        }
        return $result;
    }
    /**
     * Starting at the $cur_node in a $tagged_phrase parse tree for an English
     * sentence, create a phrase string for each of the next nodes
     * which belong to part of speech group $type.
     *
     * @param array &$cur_node node within parse tree
     * @param array $tagged_phrase parse tree for phrase
     * @param string $type self::$noun_type, self::$verb_type, etc
     * @return string phrase string involving only terms of that $type
     */
    public static function parseTypeList(&$cur_node, $tagged_phrase, $type)
    {
        $string = "";
        $previous_string = "";
        $previous_tag = "";
        $start_node = $cur_node;
        $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
            trim($tagged_phrase[$cur_node]['tag']);
        $allowed_conjuncts = [];
        while ($next_tag && (in_array($next_tag, $type) ||
            in_array($next_tag, $allowed_conjuncts))) {
            $previous_string = $string;
            $string .= $tagged_phrase[$cur_node]['token'];
            $cur_node++;
            $allowed_conjuncts = self::$conjunction_type;
            $previous_tag = $next_tag;
            $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
                trim($tagged_phrase[$cur_node]['tag']);
        }
        if (in_array($previous_tag, $allowed_conjuncts) && $start_node <
            $cur_node) {
            $cur_node--;
            $string = $previous_string;
        }
        return $string;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for an adjective if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "JJ" a subarray with a token node for the adjective that was
     *      parsed
     */
    public static function parseAdjective($tagged_phrase, $tree)
    {
        $adjective_string = self::parseTypeList($tree['cur_node'],
            $tagged_phrase, self::$adjective_type);
        if (!empty($adjective_string)) {
            $tree["JJ"] = $adjective_string;
            $partical_string = self::parseTypeList($tree['cur_node'],
                $tagged_phrase,self::$particle_type);
            if ($partical_string) {
                $tree["PARTICAL"] = $partical_string;
            }
        }
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a determiner if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["curnode" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "DT" a subarray with a token node for the determiner that was
     *      parsed
     */
    public static function parseDeterminer($tagged_phrase, $tree)
    {
        $determiner_string = "";
        /* In: All the cows low, "All the" is considered a determiner.
           That is, we will mush together the predeterminer with the determiner
         */
        $determiner_string = self::parseTypeList($tree['cur_node'],
            $tagged_phrase, self::$determiner_type);
        if (!empty($determiner_string)) {
            $tree["DT"] = $determiner_string;
            $partical_string = self::parseTypeList($tree['cur_node'],
                $tagged_phrase,self::$particle_type);
            if ($partical_string) {
                $tree["PARTICAL"] = $partical_string;
            }
        }
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a noun if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["curnode" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NN" a subarray with a token node for the noun string that was
     *      parsed
     */
    public static function parseNoun($tagged_phrase, $tree)
    {
        //Combining multiple noun into one
        $noun_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
            self::$noun_type);
        if (!empty($noun_string)) {
            $tree["NN"] = $noun_string;
            $partical_string = self::parseTypeList($tree['cur_node'],
                $tagged_phrase,self::$particle_type);
            if ($partical_string) {
                $tree["PARTICAL"] = $partical_string;
            }
        }
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a verb if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["curnode" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VB" a subarray with a token node for the verb string that was
     *      parsed
     */
    public static function parseVerb($tagged_phrase, $tree)
    {
        $verb_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
            self::$verb_type);
        if (!empty($verb_string)) {
            $tree["VB"] = $verb_string;
            $partical_string = self::parseTypeList($tree['cur_node'],
                $tagged_phrase,self::$particle_type);
            if ($partical_string) {
                $tree["PARTICAL"] = $partical_string;
            }
        }
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a sequence of
     * prepositional phrases if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @param int $index which term in $tagged_phrase to start to try to parse
     *      a preposition from
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      parsed followed by additional possible fields (here i
     *      represents the ith clause found):
     *      "IN_i" with value a preposition subtree
     *      "DT_i" with value a determiner subtree
     *      "JJ_i" with value an adjective subtree
     *      "NN_i"  with value an additional noun subtree
     */
    public static function parsePrepositionalPhrases($tagged_phrase, $tree,
        $index = 1)
    {
        $cur_node = $tree['cur_node'];
        /* There are two forms of preposition.
           The first one has lc only
           之前(lc) 他在看书 */
        if (isset($tagged_phrase[$cur_node]['tag']) &&
            trim($tagged_phrase[$cur_node]['tag']) == "LC") {
            $tree["LC"] = $tagged_phrase[$cur_node]['token'];
            $tree['cur_node']+=1;
            return $tree;
        }
        /* Second form:
          format: prep [anything] [locolizer|punctuation]
           在(p)今天早上,(pu) 他 在(p) 车 里(lc) 睡觉。
           In the morning today, he was sleeping in the car.
         */
        if (isset($tagged_phrase[$cur_node]['tag']) &&
            trim($tagged_phrase[$cur_node]['tag']) == "P") {
            /* can have multiple prep's in a row, for example,
               it is known in over 20 countries
              */
            $preposition_string = self::parseTypeList($cur_node, $tagged_phrase,
                ["P"]);
            if (!empty($preposition_string)) {
                $tree["P"] = $preposition_string;
            }
            while(isset($tagged_phrase[$cur_node]) &&
                isset($tagged_phrase[$cur_node]['tag']) &&
                !in_array($tagged_phrase[$cur_node]['tag'],["PU", "LC"])) {
                $tree["P"] .= $tagged_phrase[$cur_node]['token'];
                $cur_node++;
            }
            $lc_string = self::parseTypeList($cur_node, $tagged_phrase,
                ["LC"]);
            if ($lc_string) {
                $tree["P"] .= $lc_string;
            }
            $partical_string = self::parseTypeList($cur_node,
                $tagged_phrase,self::$particle_type);
            if ($partical_string) {
                $tree["PARTICAL"] = $partical_string;
            }
        }
        $tree['cur_node'] = $cur_node;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a noun phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["curnode" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NP" a subarray with possible fields
     *      "DT" with value a determiner subtree
     *      "JJ" with value an adjective subtree
     *      "NN" with value a noun tree
     */
    public static function parseNounPhrase($tagged_phrase, $tree)
    {
        $cur_node = $tree['cur_node'];
        $tree_dt = self::parseDeterminer($tagged_phrase,
            ['cur_node' => $cur_node]);
        $tree_jj = self::parseAdjective($tagged_phrase,
            ['cur_node' => $tree_dt['cur_node']]);
        $tree_pp = self::parsePrepositionalPhrases($tagged_phrase,
            ['cur_node' => $tree_jj['cur_node']]);
        $tree_nn = self::parseNoun($tagged_phrase,
            ['cur_node' => $tree_pp['cur_node']]);
        if ($tree_nn['cur_node'] == $tree_pp['cur_node']) {
            $tree['NP'] = "";
            return $tree;
        }
        $tree_pp2 = self::parsePrepositionalPhrases($tagged_phrase,
            ['cur_node' => $tree_nn['cur_node']]);
        $cur_node = $tree_pp2['cur_node'];
        $cc = "";
        if (!empty($tagged_phrase[$cur_node]['tag']) &&
            in_array($tagged_phrase[$cur_node]['tag'],
            self::$conjunction_type)) {
            $cc = $tagged_phrase[$cur_node]['token'];
            $cur_node++;
        }
        $tree_np = self::parseNounPhrase($tagged_phrase,
            ['cur_node' => $cur_node]);
        if ($tree_np['cur_node'] == $cur_node && $cc) {
            $cur_node--;
            $tree_np = [];
            $cc = "";
        } else {
            $cur_node = $tree_np['cur_node'];
        }
        unset($tree_dt['cur_node'], $tree_jj['cur_node'],
            $tree_nn['cur_node'], $tree_pp['cur_node'],
            $tree_np['cur_node'], $tree_pp2['cur_node']);
        $sub_tree = ['DT' => $tree_dt, 'JJ' => $tree_jj, 'PRP' => $tree_pp,
        'NN' => $tree_nn, 'PRP2'=>$tree_pp2,'CC' => $cc,
            'ADD_NP' => $tree_np];
        return ['cur_node' => $cur_node, 'NP' => $sub_tree];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a verb phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["curnode" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VP" a subarray with possible fields
     *      "VB" with value a verb subtree
     *      "NP" with value an noun phrase subtree
     */
    public static function parseVerbPhrase($tagged_phrase, $tree)
    {
        $cur_node = $tree['cur_node'];
        $adverb_string="";
        do {
            $start_node=$cur_node;
            $adverb_string .= self::parseTypeList($cur_node, $tagged_phrase,
                self::$adverb_type);
            $adverb_string .= self::parseTypeList($cur_node, $tagged_phrase,
                self::$particle_type);
        } while($start_node!=$cur_node);
        if (!empty($adverb_string)) {
            $tree_vb["RB"] = $adverb_string;
        } else {
            $tree_vb=[];
        }
        $tree_vb = array_merge($tree_vb,
            self::parseVerb($tagged_phrase, ['cur_node' => $cur_node]));
        if ($cur_node == $tree_vb['cur_node']) {
            // if no verb return what started with
            return $tree;
        }
        $cur_node = $tree_vb['cur_node'];
        $tree_np = self::parseNounPhrase($tagged_phrase,
            ['cur_node' => $tree_vb['cur_node']]);
        $cur_node = $tree_np['cur_node'];
        if (!empty($tree_np['NP'])) {
            unset($tree_vb['cur_node'], $tree_np['cur_node']);
            return ['VP' => ['VB' => $tree_vb, 'NP' => $tree_np['NP']],
                'cur_node' => $cur_node];
        }
        unset($tree_vb['cur_node']);
        return ['VP' => ['VB' => $tree_vb], 'cur_node' => $cur_node];
    }
    /**
     * Given a part-of-speeech tagged phrase array generates a parse tree
     * for the phrase using a recursive descent parser.
     *
     * @param array $tagged_phrase an array of pairs of the form
     *  ("token" => token_for_term, "tag"=> part_of_speech_tag_for_term)
     * @param $tree that consists of ["curnode" =>
     *  current parse position in $tagged_phrase]
     * @param $tree_np_pre subject found from previous sub-sentence
     * @return array used to represent a tree. The array has up to three fields
     *  $tree["cur_node"] index of how far we parsed our$tagged_phrase
     *  $tree["NP"] contains a subtree for a noun phrase
     *  $tree["VP"] contains a subtree for a verb phrase
     */
    public static function parseWholePhrase($tagged_phrase, $tree,
        $tree_np_pre = [])
    {
        //remove heading adverbs
        $cur_node = $tree['cur_node'];
        do {
            $start_node = $cur_node;
            self::parseTypeList($cur_node, $tagged_phrase,
                self::$adverb_type);
            self::parseTypeList($cur_node, $tagged_phrase,
                self::$particle_type);
        } while ($start_node != $cur_node);
        $tree_np = self::parseNounPhrase($tagged_phrase,
            ["cur_node" => $cur_node]);
        if ($tree_np['cur_node'] == $cur_node) {
            if (!empty($tree_np_pre)) {
                $tree_np['NP'] = $tree_np_pre;
                $tree_np["cur_node"] = $cur_node;
            } else {
                return $tree;
            }
        }
        $tree_vp = self::parseVerbPhrase($tagged_phrase,
            ["cur_node" => $tree_np['cur_node']]);
        if ($tree_np['cur_node'] == $tree_vp['cur_node']) {
            return $tree;
        }
        $cur_node = $tree_vp['cur_node'];
        unset($tree_np['cur_node'], $tree_vp['cur_node']);
        if (!empty($tree_start) && !empty($tree_np['NP'])) {
            $tree_np['NP']['PRP-1'] = $tree_start;
        }
        return ['cur_node' => $cur_node, 'NP' => $tree_np['NP'],
            'VP' => $tree_vp['VP']];
    }
    /**
     * Takes a parse tree of a phrase and computes subject, predicate, and
     * object arrays. Each of these array consists of two components CONCISE and
     * RAW, CONCISE corresponding to something more similar to the words in the
     * original phrase and RAW to the case where extraneous words have been
     * removed
     *
     * @param are $tree a parse tree for a sentence
     * @return array triplet array
     */
    public static function extractTripletsParseTree($tree)
    {
        $triplets = [];
        $triplets['subject'] = self::extractSubjectParseTree($tree);
        $triplets['predicate'] = self::extractPredicateParseTree($tree);
        $triplets['object'] = self::extractObjectParseTree($tree);
        return $triplets;
    }
    /**
     * Takes phrase tree $tree and a part-of-speech $pos returns
     * the deepest $pos only path in tree.
     *
     * @param array $tree phrase to extract type from
     * @param string $pos the part of speech to extract
     * @return string the label of deepest $pos only path in $tree
     */
    public static function extractDeepestSpeechPartPhrase($tree, $pos)
    {
        $extract = "";
        if (!empty($tree[$pos])) {
            $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos);
        }
        if (!$extract && !empty($tree[$pos]) && !empty($tree[$pos][$pos])) {
            $extract = $tree[$pos][$pos];
        }
        return $extract;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the object of
     * the original phrase (as a string) the latter having the importart
     * parts of the object
     *
     * @param array representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractObjectParseTree($tree)
    {
        $object = [];
        if (!empty($tree['VP'])) {
            $tree_vp = $tree['VP'];
            if (!empty($tree_vp['NP'])) {
                $nb = $tree_vp['NP'];
                $object['CONCISE'] = $tree_vp['NP'];
                while (isset($object['CONCISE']["ADD_NP"]["NP"]) &&
                    is_array($object['CONCISE']["ADD_NP"]["NP"])) {
                    $object['CONCISE'] = $object['CONCISE']["ADD_NP"]["NP"];
                }
                $object['CONCISE'] = $object['CONCISE']["NN"]["NN"] ??
                    "";
                $raw_object = "";
                $it = new \RecursiveIteratorIterator(
                    new \RecursiveArrayIterator($nb));
                foreach ($it as $v) {
                    $raw_object .= $v;
                }
                $object['RAW'] = $raw_object;
            } else {
                $object['CONCISE'] = "";
                $object['RAW'] = "";
            }
        } else {
            $object['CONCISE'] = "";
            $object['RAW'] = "";
        }
        return $object;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the predicate of
     * the original phrase (as a string) the latter having the importart
     * parts of the predicate
     *
     * @param array representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractPredicateParseTree($tree)
    {
        $predicate = [];
        if (!empty($tree['VP'])) {
            $tree_vp = $tree['VP'];
            $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
                $tree_vp, "VB");
            $raw_predicate = "";
            if (!empty($tree_vp['VB'])) {
                $tree_vb = $tree_vp['VB'];
                $it = new \RecursiveIteratorIterator(
                    new \RecursiveArrayIterator($tree_vb));
                foreach ($it as $v) {
                    $raw_predicate .= $v;
                }
                $predicate['RAW'] = $raw_predicate;
            }
        } else {
            $predicate['CONCISE'] = "";
            $predicate['RAW'] = "";
        }
        return $predicate;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the subject of
     * the original phrase (as a string) the latter having the importart
     * parts of the subject
     *
     * @param array representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractSubjectParseTree($tree)
    {
        $subject = [];
        if (!empty($tree['NP'])) {
            $subject['CONCISE'] = $tree['NP'];
            while (isset($subject['CONCISE']["ADD_NP"]["NP"]) &&
                is_array($subject['CONCISE']["ADD_NP"]["NP"])) {
                $subject['CONCISE'] = $subject['CONCISE']["ADD_NP"]["NP"];
            }
            $subject['CONCISE'] = $subject['CONCISE']["NN"]["NN"] ?? "";
            $raw_subject = "";
            $it = new \RecursiveIteratorIterator(
                new \RecursiveArrayIterator($tree['NP']));
            foreach ($it as $v) {
                $raw_subject .= $v ;
            }
            $subject['RAW'] = $raw_subject;
        } else {
            $subject['CONCISE'] = "";
            $subject['RAW'] = "";
        }
        return $subject;
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
     * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST
     * subfields
     *
     * @param array $sub_pred_obj_triplets in format described above
     * @return array $processed_triplets in format described above
     */
    public static function rearrangeTripletsByType($sub_pred_obj_triplets)
    {
        $processed_triplet = [];
        $processed_triplets['CONCISE'] =
            self::extractTripletByType($sub_pred_obj_triplets, "CONCISE");
        $processed_triplets['RAW'] =
            self::extractTripletByType($sub_pred_obj_triplets, "RAW");
        return $processed_triplets;
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE, RAW subfields and produces a triplits with $type subfield (where
     * $type is one of CONCISE and RAW) and with subject, predicate, object,
     * and QUESTION_ANSWER_LIST subfields
     *
     * @param array $sub_pred_obj_triplets  in format described above
     * @param string $type either CONCISE or RAW
     * @return array $triplets in format described above
     */
    public static function extractTripletByType($sub_pred_obj_triplets, $type)
    {
        $triplets = [];
        if (!empty($sub_pred_obj_triplets['subject'][$type])
            && !empty($sub_pred_obj_triplets['predicate'][$type])
            && !empty($sub_pred_obj_triplets['object'][$type])) {
            $question_answer_triplets = [];
            $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]),
                trim($sub_pred_obj_triplets['predicate'][$type]),
                trim($sub_pred_obj_triplets['object'][$type])];
            $question_triplets = [];
            for ($j = 0; $j < 2; $j++) {
                if ($j == 1 && in_array($sentence[1], ['是'])) {
                    $tmp = $sentence[2];
                    $sentence[2] = $sentence[0];
                    $sentence[0] = $tmp;
                } else if ($j == 1) {
                    break;
                }
                for ($i = 0; $i < 3; $i++) {
                    $q_sentence = $sentence;
                    $q_sentence[$i] = self::$question_token;
                    $q_sentence_string = implode(" ", $q_sentence);
                    $question_triplets[] = $q_sentence_string;
                    $question_answer_triplets[$q_sentence_string] =
                        preg_replace('/\s+/u', ' ',$sentence[$i]);
                }
            }
            $triplets['QUESTION_LIST'] = $question_triplets;
            $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets;
        }
        return $triplets;
    }
    /**
     * Takes any question started with WH question and returns the
     * triplet from the question
     *
     * @param string $question question to parse
     * @return array question triplet
     */
    public static function questionParser($question)
    {
        $tagged_question = self::tagTokenizePartOfSpeech($question);
        $generated_questions = [];
        $keywords = self::isQuestion($question);
        if ($keywords) {
            $generated_questions=self::parseQuestion(
                $tagged_question, 1, $keywords);
        }
        return $generated_questions;
    }
    /**
     * Takes a phrase query entered by user and return true if it is question
     * and false if not
     *
     * @param $phrase any statement
     * @return bool returns question word if statement is question
     */
    public static function isQuestion($phrase)
    {
        $terms=self::getStochasticTermSegmenter()->segmentSentence($phrase);
        $qt=self::questionType($terms, self::$question_words);
        if (in_array($qt["types"], ["who","what","which","where","when",
            "whose", "how", "how many", "how long", "how big"])) {
            return $qt["ques_words"];
        }
        return false;
    }
    /**
     * Takes tagged question string starts with Who
     * and returns question triplet from the question string
     *
     * @param string $tagged_question part-of-speech tagged question
     * @param int $index current index in statement
     * @param string $question_word is the question word need to be replaced
     * @return array parsed triplet
     */
    public static function parseQuestion($tagged_question, $index,
        $question_word)
    {
        $generated_questions = [];
        $tree = ["cur_node" => 0];
        $parse_tree = self::parseWholePhrase($tagged_question, $tree);
        $triplets = self::extractTripletsParseTree($parse_tree);
        $triplet_types = ['CONCISE', 'RAW'];
        foreach ($triplet_types as $type) {
            if (!empty($triplets['subject'][$type])
                && !empty($triplets['predicate'][$type])
                && !empty($triplets['object'][$type])) {
                $sub = trim($triplets['subject'][$type]);
                $sub = preg_replace("/^.*".$question_word.".*$/u",
                    self::$question_token, $sub);
                $pre = trim($triplets['predicate'][$type]);
                $pre = preg_replace("/^.*".$question_word.".*$/u",
                    self::$question_token, $pre);
                $obj = trim($triplets['object'][$type]);
                $obj = preg_replace("/^.*".$question_word.".*$/u",
                    self::$question_token, $obj);
                $generated_questions[$type][] = $obj . " " . $pre . " " . $sub;
                $generated_questions[$type][] = $sub . " " . $pre . " " . $obj;
            }
        }
        return $generated_questions;
    }
    /**
     * Helper function for isQuestion
     * @param $term_array segmented Chinese terms
     * @param $type_list currect trace of self::$question_words
     * return ["ques_words"=>ques_words,"types"=>types]
     */
    public static function questionType($term_array, $type_list)
    {
        if (!isset($type_list["any"])) {
            return ["ques_words"=>"","types"=>""];
        }
        $types = "";
        $ques_words = "";
        for($i = 0; $i < count($term_array); $i++ ) {
            foreach($type_list["any"] as $key => $value) {
                if (preg_match('/^('.$key.')$/u',$term_array[$i])) {
                    if (is_array($value)) {
                        if(isset($value["after"])) {
                            $found_after = false;
                            if (array_key_exists($i+1,$term_array)) {
                                foreach($value["after"] as $key2 => $value2) {
                                    if (preg_match('/^(' . $key2 . ')$/u',
                                        $term_array[$i + 1])) {
                                        $ques_words = $term_array[$i].
                                            " " . $term_array[$i + 1];
                                        $types = $value2;
                                        $found_after = true;
                                        break;
                                    }
                                }
                            }
                            if (!$found_after && isset($type_list["other"]) &&
                                $value["other"]) {
                                $ques_words = $term_array[$i];
                                $types = $value["other"];
                            }
                        } elseif (isset($value["any"])) {
                            $t = self::questionType($term_array,$value);
                            $ques_words[] = $term_array[$i];
                            $types = $t["types"];
                        }
                    } elseif ($value) {
                        $ques_words = $term_array[$i];
                        $types = $value;
                    }
                }
            }
        }
        if ($types == "" && isset($type_list["other"])) {
            if (is_array($type_list["other"])) {
                $t = self::questionType($term_array, $type_list["other"]);
                $ques_words = $t["ques_words"];
                $types = $t["types"];
            } elseif ($type_list["other"]) {
                $types = $type_list["other"];
            }
        }
        return ["ques_words" => $ques_words, "types" => $types];
    }
}
ViewGit