Last commit for src/locale/hi/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\locale\hi\resources;

use seekquarry\yioop\models as M;
/**
 * Hindi specific tokenization code. It provides a lexicon and rule based
 * part-of-speech tagger, a small recursive descent phrase parser, and
 * helpers that turn parsed statements into question/answer triplets.
 *
 * The stemmer was meant to be a port of Ljiljana Dolamic's (University of
 * Neuchatel, www.unine.ch/info/clef/) Java stemming algorithm:
 * http://members.unine.ch/jacques.savoy/clef/HindiStemmerLight.java.txt
 * Given a word, its stem is that part of the word that
 * is common to all its inflected variants. For example,
 * tall is common to tall, taller, tallest. A stemmer takes
 * a word and tries to produce its stem. In this version of the class
 * stem() and removeSuffix() are stubs which return their input unchanged.
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * List of verb-like parts of speech that might appear in lexicon
     * @var array
     */
    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
        "RB"];
    /**
     * List of noun-like parts of speech that might appear in lexicon
     * @var array
     */
    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"];
    /**
     * List of adjective-like parts of speech that might appear in lexicon
     * @var array
     */
    public static $adjective_phrases = ["JJ", "JJR", "JJS"];
    /**
     * List of postpositional-like parts of speech that might appear in lexicon
     * @var array
     */
    public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN",
        "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"];
    /**
     * List of question words in Hindi
     * @var array
     */
    public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
            "जिसका", "कहाँ", "कहां"];
    /**
     * Any unique identifier corresponding to the component of a triplet which
     * can be answered using a question answer list
     * @var string
     */
    public static $question_marker = "qqq";
    /**
     * Words we don't want to be stemmed
     * @var array
     */
    public static $no_stem_list = [];
    /**
     * Stub function which could be used for a word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment  before segmentation
     * @return string should return string with words separated by space
     *     in this case does nothing
     */
    public static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Computes the stem of a Hindi word. Currently a stub: the word is
     * returned unchanged.
     *
     * @param string $word the string to stem
     * @return string the stem of $word (at present $word itself)
     */
    public static function stem($word)
    {
        return $word;
    }
    /**
     * Removes common Hindi suffixes. Currently a stub: the word is
     * returned unchanged.
     *
     * @param string $word to remove suffixes from
     * @return string result of suffix removal (at present $word itself)
     */
    private static function removeSuffix($word)
    {
        return $word;
    }
    /**
     * The method takes as input a phrase and returns a string with each
     * term tagged with a part of speech.
     *
     * @param string $phrase text to add parts speech tags to
     * @param bool $with_tokens whether to include the terms and the tags
     *      in the output string or just the part of speech tags
     * @return string $tagged_phrase which is a string of format term~pos
     */
    public static function tagPartsOfSpeechPhrase($phrase, $with_tokens = true)
    {
        $tagged_tokens = self::tagTokenizePartofSpeech($phrase);
        return self::taggedPartOfSpeechTokensToString($tagged_tokens,
            $with_tokens);
    }
    /**
     * Uses the lexicon to assign a tag to each token and then uses a rule
     * based approach to assign the most likely of tags to each token.
     * Numeric tokens are always tagged NN and the copulas "है"/"हैं" are
     * always tagged VB, regardless of what the lexicon says.
     *
     * @param string $text input phrase which is to be tagged
     * @return array sequence of ["token" => term, "tag" => part of speech]
     *      pairs, one per whitespace separated term of $text; empty if
     *      $text has no terms
     */
    public static function tagTokenizePartofSpeech($text)
    {
        /* PREG_SPLIT_NO_EMPTY keeps leading/trailing whitespace from
           producing empty tokens that would otherwise be tagged as nouns
         */
        $tokens = preg_split("/\s+/u", $text, -1, PREG_SPLIT_NO_EMPTY);
        if (empty($tokens)) {
            // also covers preg_split failing (e.g., on invalid UTF-8)
            return [];
        }
        $result = [];
        $model = new M\Model();
        /* The term is bound as a query parameter rather than being
           interpolated into the SQL, so tokens containing quotes can
           neither break nor alter the query.
           NOTE(review): relies on the datasource's execute() accepting
           an array of positional parameters as its second argument
         */
        $sql = "SELECT PART_OF_SPEECH FROM LEXICON WHERE TERM = ? " .
            "AND LOCALE = 'hi'";
        foreach ($tokens as $token) {
            $token = trim($token);
            if ($token === "") {
                continue;
            }
            $tag = "UNKNOWN";
            // lookup is best effort: a failed query leaves the tag UNKNOWN
            $query_result = @$model->db->execute($sql, [$token]);
            if ($query_result !== false) {
                $row = $model->db->fetchArray($query_result);
                // no matching row means the term is not in the lexicon
                if (is_array($row) && isset($row["PART_OF_SPEECH"])) {
                    $tag = $row["PART_OF_SPEECH"];
                }
            }
            if (is_numeric($token)) {
                $tag = "NN";
            } else if ($token === "है" || $token === "हैं") {
                $tag = "VB";
            }
            $result[] = ["token" => $token, "tag" => $tag];
        }
        return self::tagUnknownWords($result);
    }
    /**
     * This method tags the remaining words in a partially tagged text array.
     * Rules are applied to each adjacent (previous, current) pair in which
     * at least one of the two tags is "UNKNOWN". Consequently a text of
     * only one token is returned as is.
     *
     * @param array $partially_tagged_text term array representing a text
     *      passage. Each element in array is in turn an associative array
     *      [token => token_value, tag => tag_value (may be "UNKNOWN")]
     * @return array text passage array in which UNKNOWN tags of texts with
     *      at least two tokens have been replaced by guessed tags
     */
    public static function tagUnknownWords($partially_tagged_text)
    {
        $result = $partially_tagged_text;
        if (empty($result)) {
            // nothing to tag, and $result[0] below would not exist
            return $result;
        }
        $verbs = ["VBZ", "VBD", "VBN"];
        $noun_predictors = ["JJ", "PRO_NN", "POST_POS"];
        $adjective_suffixes = ["इक", "िक", "तर", "तम"];
        $length = count($result);
        $previous = $result[0];
        for ($i = 1; $i < $length; $i++) {
            $current = $result[$i];
            $current["token"] = trim($current["token"]);
            $current["tag"] = trim($current["tag"]);
            if ($current["tag"] === "UNKNOWN" ||
                $previous["tag"] === "UNKNOWN") {
                /**
                 * RULE 1: If the previous word is tagged as an adjective,
                 * pronoun, or postposition then the current word is likely
                 * to be a noun
                 */
                if (in_array($previous["tag"], $noun_predictors, true)) {
                    $current["tag"] = "NN";
                    $result[$i] = $current;
                }
                /**
                 * RULE 2: If the current word is a verb then the previous
                 * word is likely to be a noun
                 */
                if (in_array($current["tag"], $verbs, true)) {
                    $previous["tag"] = "NN";
                    $result[$i - 1] = $previous;
                }
                /**
                 * PRONOUN IDENTIFICATION
                 * RULE 3: If the previous word is unknown and current word
                 * is a noun then the previous word is most likely to be a
                 * pronoun
                 */
                if ($previous["tag"] === "UNKNOWN" &&
                    $current["tag"] === "NN") {
                    $previous["tag"] = "PRP";
                    $result[$i - 1] = $previous;
                }
                /**
                 * VERB IDENTIFICATION
                 * RULE 4: If the current word is tagged as auxiliary verb
                 * and previous word is tagged as unknown then most likely
                 * the previous word is a verb
                 */
                if ($current["tag"] === "VAUX" &&
                    $previous["tag"] === "UNKNOWN") {
                    $previous["tag"] = "VB";
                    $result[$i - 1] = $previous;
                }
                /**
                 * ADJECTIVE IDENTIFICATION
                 * RULE 5: if the current word ends with "तम" or "इक" or "िक"
                 * or "तर" then the word is an adjective. The encoding is
                 * given explicitly as the suffixes above are UTF-8 literals
                 */
                $suffix = mb_substr($current["token"], -2, 2, "UTF-8");
                if (in_array($suffix, $adjective_suffixes, true)) {
                    $current["tag"] = "JJ";
                    $result[$i] = $current;
                }
                // anything still unknown after the rules defaults to noun
                if ($current["tag"] === "UNKNOWN") {
                    $current["tag"] = "NN";
                    $result[$i] = $current;
                }
                if ($previous["tag"] === "UNKNOWN") {
                    $previous["tag"] = "NN";
                    $result[$i - 1] = $previous;
                }
            }
            $previous = $current;
        }
        return $result;
    }
    /**
     * This method is used to simplify the different tags of speech to a
     * common form. Tags without a simplified form are output unchanged.
     *
     * @param array $tagged_tokens which is an array of tokens assigned tags.
     * @param bool $with_tokens whether to include the terms and the tags
     *      in the output string or just the part of speech tags
     * @return string $tagged_phrase which is a string of form token~pos
     *      entries, each followed by a single space
     */
    public static function taggedPartOfSpeechTokensToString($tagged_tokens,
        $with_tokens = true)
    {
        $tagged_phrase = "";
        $simplified_parts_of_speech = [
            "NNS" => "NN", "NNP" => "NN", "NNPS" => "NN", "WP" => "NN",
            "VB" => "VB", "VBD" => "VB", "VBN" => "VB", "VBP" => "VB",
            "VBZ" => "VB",
            "JJ" => "AJ", "JJR" => "AJ", "JJS" => "AJ",
            "RB" => "AV", "RBR" => "AV", "RBS" => "AV", "WRB" => "AV",
            "inj" => "IN", "case" => "IN", "proNN" => "IN", "particle" => "IN",
            "PREP" => "IN", "IN" => "IN", "PSP" => "IN",
            "direct_DT" => "DT",
        ];
        foreach ($tagged_tokens as $t) {
            $tag = trim($t["tag"]);
            if (isset($simplified_parts_of_speech[$tag])) {
                $tag = $simplified_parts_of_speech[$tag];
            }
            $token = ($with_tokens) ? $t["token"] . "~" : "";
            $tagged_phrase .= $token . $tag . " ";
        }
        return $tagged_phrase;
    }
    /**
     * Starting at position $cur_node of $tagged_phrase, gathers the tokens
     * of the longest run of consecutive terms whose (trimmed) tag belongs
     * to $allowed_tags.
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag" => part_of_speech_tag_for_term)
     * @param int $cur_node position in $tagged_phrase to start scanning at
     * @param array $allowed_tags part of speech tags that continue the run
     * @return array pair [string of the run's tokens each preceded by a
     *      space (empty string if no term matched), position of first
     *      term after the run]
     */
    private static function collectTaggedRun($tagged_phrase, $cur_node,
        $allowed_tags)
    {
        $run = "";
        while (isset($tagged_phrase[$cur_node]["tag"]) &&
            in_array(trim($tagged_phrase[$cur_node]["tag"]), $allowed_tags,
                true)) {
            $run .= " " . $tagged_phrase[$cur_node]["token"];
            $cur_node++;
        }
        return [$run, $cur_node];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a noun if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag" => part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NN" string made of the consecutive nouns that were parsed
     *      (only present if at least one noun was found)
     */
    public static function extractNoun($tagged_phrase, $tree)
    {
        // consecutive nouns are combined into one noun string
        list($noun_string, $cur_node) = self::collectTaggedRun(
            $tagged_phrase, $tree["cur_node"], self::$noun_phrases);
        if (!empty($noun_string)) {
            $tree["NN"] = $noun_string;
        }
        $tree["cur_node"] = $cur_node;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and parse-tree with a
     * parse-from position and builds a parse tree for a sequence of
     * postpositional phrases if possible. Each phrase is a run of
     * postpositions, optionally followed by adjectives and then nouns;
     * the phrase after it is parsed recursively and stored in a nested
     * "POST" entry.
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @param int $index number of the postpositional phrase being parsed,
     *      used as a suffix of the keys added for it
     * @return array $tree with fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "POST" array with possible fields "IN_$index", "JJ_$index",
     *      "NN_$index" and a nested "POST"; empty if no postposition
     *      was found at the start position
     */
    public static function extractPostpositionPhrase($tagged_phrase, $tree,
        $index = 1)
    {
        $start_node = $tree["cur_node"];
        $tree_pp = ["cur_node" => $start_node];
        /* tags are trimmed by collectTaggedRun, consistent with how
           extractNoun, extractVerb, and extractAdjective match tags
         */
        list($pp_string, $cur_node) = self::collectTaggedRun(
            $tagged_phrase, $start_node, self::$postpositional_phrases);
        if ($cur_node != $start_node) {
            if (!empty($pp_string)) {
                $tree_pp["IN_$index"] = $pp_string;
            }
            list($adjective_string, $cur_node) = self::collectTaggedRun(
                $tagged_phrase, $cur_node, self::$adjective_phrases);
            if (!empty($adjective_string)) {
                $tree_pp["JJ_$index"] = $adjective_string;
            }
            list($nn_string, $cur_node) = self::collectTaggedRun(
                $tagged_phrase, $cur_node, self::$noun_phrases);
            if (!empty($nn_string)) {
                $tree_pp["NN_$index"] = $nn_string;
            }
            $tree_pp["cur_node"] = $cur_node;
            // try to parse a further postpositional phrase after this one
            $tree_next = self::extractPostpositionPhrase($tagged_phrase,
                $tree_pp, $index + 1);
            $tree_pp = array_merge($tree_pp, $tree_next);
        }
        $tree["cur_node"] = $tree_pp["cur_node"];
        unset($tree_pp["cur_node"]);
        $tree["POST"] = $tree_pp;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and parse-tree with a
     * parse-from position and builds a parse tree for a noun phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NP" either the empty string if nothing was parsed, or an array
     *      with fields "JJ" an adjective subtree and "NN" a noun subtree
     */
    public static function extractNounPhrase($tagged_phrase, $tree)
    {
        $cur_node = $tree["cur_node"];
        $tree_jj = self::extractAdjective($tagged_phrase,
            ["cur_node" => $cur_node]);
        $tree_nn = self::extractNoun($tagged_phrase,
            ["cur_node" => $tree_jj["cur_node"]]);
        if ($tree_nn["cur_node"] == $cur_node) {
            // neither an adjective nor a noun was found
            $tree["NP"] = "";
            return $tree;
        }
        $cur_node = $tree_nn["cur_node"];
        unset($tree_jj["cur_node"], $tree_nn["cur_node"]);
        return [
            "cur_node" => $cur_node,
            "NP" => ["JJ" => $tree_jj, "NN" => $tree_nn],
        ];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a verb if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VB" string made of the consecutive verbs that were parsed
     *      (only present if at least one verb was found)
     */
    public static function extractVerb($tagged_phrase, $tree)
    {
        list($verb_string, $cur_node) = self::collectTaggedRun(
            $tagged_phrase, $tree["cur_node"], self::$verb_phrases);
        if (!empty($verb_string)) {
            $tree["VB"] = $verb_string;
        }
        $tree["cur_node"] = $cur_node;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a verb phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VP" a subarray, empty if no verb was found, otherwise with
     *      fields "VB" a verb subtree (which also holds any postpositions
     *      following the verb under "IN") and, if a noun phrase follows,
     *      "NP" that noun phrase's subtree
     */
    public static function extractVerbPhrase($tagged_phrase, $tree)
    {
        $cur_node = $tree["cur_node"];
        $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]);
        if ($tree_vb["cur_node"] == $cur_node) {
            $tree["VP"] = [];
            return $tree;
        }
        list($postposition_string, $cur_node) = self::collectTaggedRun(
            $tagged_phrase, $tree_vb["cur_node"],
            self::$postpositional_phrases);
        if (!empty($postposition_string)) {
            $tree_vb["IN"] = $postposition_string;
        }
        unset($tree_vb["cur_node"]);
        $tree_new_sub = ["VB" => $tree_vb];
        $tree_np = self::extractNounPhrase($tagged_phrase,
            ["cur_node" => $cur_node]);
        if ($tree_np["cur_node"] != $cur_node) {
            // a noun phrase followed the verb, so make it part of the VP
            $cur_node = $tree_np["cur_node"];
            $tree_new_sub["NP"] = $tree_np["NP"];
        }
        return ["cur_node" => $cur_node, "VP" => $tree_new_sub];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for an adjective if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "JJ" string made of the consecutive adjectives that were parsed
     *      (only present if at least one adjective was found)
     */
    public static function extractAdjective($tagged_phrase, $tree)
    {
        list($adjective_string, $cur_node) = self::collectTaggedRun(
            $tagged_phrase, $tree["cur_node"], self::$adjective_phrases);
        if (!empty($adjective_string)) {
            $tree["JJ"] = $adjective_string;
        }
        $tree["cur_node"] = $cur_node;
        return $tree;
    }
    /**
     * Given a part-of-speech tagged phrase array generates a parse tree
     * for the phrase using a recursive descent parser. The phrase is
     * parsed as a noun phrase, followed by postpositional phrases,
     * followed by a verb phrase.
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @return array used to represent a tree. The array has four fields
     *      $tree["cur_node"] index of how far we parsed our $tagged_phrase
     *      $tree["NP"] contains a subtree for a subject phrase
     *      $tree["POST"] contains a subtree for a object phrase
     *      $tree["VP"] contains a subtree for a predicate phrase
     */
    public static function generatePhraseParseTree($tagged_phrase)
    {
        $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]);
        $tree_pp = self::extractPostpositionPhrase($tagged_phrase,
            ["cur_node" => $tree_np["cur_node"]]);
        $tree_vp = self::extractVerbPhrase($tagged_phrase,
            ["cur_node" => $tree_pp["cur_node"]]);
        return [
            "cur_node" => $tree_vp["cur_node"],
            "NP" => $tree_np["NP"],
            "POST" => $tree_pp["POST"],
            "VP" => $tree_vp["VP"],
        ];
    }
    /**
     * Scans a word list for phrases. For phrases found generate
     * a list of question and answer pairs at two levels of granularity:
     * CONCISE (using only the main noun/verb of each part of the phrase)
     * and RAW (using all the terms of each part of the phrase).
     *
     * @param array $word_and_phrase_list of statements, each statement
     *      being a key whose value is its position list
     * @return array with two fields: QUESTION_LIST an array
     *      question => position list of the statement it came from, where
     *      a question is a statement with one of its subject, object, or
     *      predicate replaced by the question marker; and
     *      QUESTION_ANSWER_LIST an array question => replaced component
     */
    public static function extractTripletsPhrases($word_and_phrase_list)
    {
        $question_list = [];
        $question_answer_list = [];
        $triplet_types = ["CONCISE", "RAW"];
        foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
            $sentence = trim(preg_replace("/\s+/u", " ", $word_and_phrase));
            $tagged_phrase = self::tagTokenizePartofSpeech($sentence);
            $parse_tree = self::generatePhraseParseTree($tagged_phrase);
            $triplets = self::extractTripletsParseTree($parse_tree);
            $extracted_triplets = self::rearrangeTripletsByType($triplets);
            foreach ($triplet_types as $type) {
                if (empty($extracted_triplets[$type])) {
                    continue;
                }
                $type_triplets = $extracted_triplets[$type];
                foreach ($type_triplets["QUESTION_LIST"] as $question) {
                    $question_list[$question] = $position_list;
                }
                $question_answer_list = array_merge($question_answer_list,
                    $type_triplets["QUESTION_ANSWER_LIST"]);
            }
        }
        return [
            "QUESTION_LIST" => $question_list,
            "QUESTION_ANSWER_LIST" => $question_answer_list,
        ];
    }
    /**
     * Takes phrase tree $tree and a part-of-speech $pos returns
     * the deepest $pos only path in tree.
     *
     * @param mixed $tree phrase (sub)tree to extract type from; when the
     *      recursion reaches a leaf this is a string
     * @param string $pos the part of speech to extract
     * @return string the label of deepest $pos only path in $tree, or the
     *      empty string if $tree has no non-empty path of at least two
     *      $pos keys
     */
    public static function extractDeepestSpeechPartPhrase($tree, $pos)
    {
        if (!is_array($tree) || empty($tree[$pos])) {
            return "";
        }
        $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos);
        if (!$extract && is_array($tree[$pos]) &&
            !empty($tree[$pos][$pos])) {
            $extract = $tree[$pos][$pos];
        }
        return $extract;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW, the former having the noun string
     * of the subject of the phrase, the latter having all the terms
     * of the subject (adjectives and nouns)
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractSubjectParseTree($tree)
    {
        $subject = ["CONCISE" => "", "RAW" => ""];
        if (!empty($tree["NP"])) {
            $subject["CONCISE"] = self::extractDeepestSpeechPartPhrase(
                $tree["NP"], "NN");
            $it = new \RecursiveIteratorIterator(
                new \RecursiveArrayIterator($tree["NP"]));
            foreach ($it as $v) {
                $subject["RAW"] .= $v . " ";
            }
        }
        return $subject;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW, the former having the verb string
     * of the predicate of the phrase, the latter having all the terms
     * of the verb subtree of the predicate
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractPredicateParseTree($tree)
    {
        // both fields always exist, even if the verb phrase has no verb
        $predicate = ["CONCISE" => "", "RAW" => ""];
        if (!empty($tree["VP"])) {
            $tree_vp = $tree["VP"];
            $predicate["CONCISE"] = self::extractDeepestSpeechPartPhrase(
                $tree_vp, "VB");
            if (!empty($tree_vp["VB"])) {
                $it = new \RecursiveIteratorIterator(
                    new \RecursiveArrayIterator($tree_vp["VB"]));
                foreach ($it as $v) {
                    $predicate["RAW"] .= $v . " ";
                }
            }
        }
        return $predicate;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW, the former having the noun string
     * of the object of the phrase, the latter having all the terms
     * of the postpositional phrases making up the object
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractObjectParseTree($tree)
    {
        $object = ["CONCISE" => "", "RAW" => ""];
        if (!empty($tree["POST"])) {
            $tree_pp = $tree["POST"];
            if (!empty($tree_pp["NP"])) {
                $object["CONCISE"] = self::extractDeepestSpeechPartPhrase(
                    $tree_pp["NP"], "NN");
            } else if (!empty($tree_pp["NN_1"])) {
                /* extractPostpositionPhrase stores the nouns of the first
                   postpositional phrase under "NN_1" and never creates an
                   "NP" entry, so without this case the concise object
                   (and hence every CONCISE triplet) would always be empty
                 */
                $object["CONCISE"] = $tree_pp["NN_1"];
            }
            $it = new \RecursiveIteratorIterator(
                new \RecursiveArrayIterator($tree_pp));
            foreach ($it as $v) {
                $object["RAW"] .= $v . " ";
            }
        }
        return $object;
    }
    /**
     * Takes a parse tree of a phrase and computes subject, predicate, and
     * object arrays. Each of these array consists of two components CONCISE
     * and RAW, CONCISE holding just the main noun or verb string of the
     * component and RAW holding all of the component's terms
     *
     * @param  array $parse_tree a parse tree for a sentence
     * @return array triplet array
     */
    public static function extractTripletsParseTree($parse_tree)
    {
        return [
            "subject" => self::extractSubjectParseTree($parse_tree),
            "object" => self::extractObjectParseTree($parse_tree),
            "predicate" => self::extractPredicateParseTree($parse_tree),
        ];
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
     * and RAW each with QUESTION_LIST and QUESTION_ANSWER_LIST subfields
     *
     * @param array $sub_pred_obj_triplets in format described above
     * @return array $processed_triplets in format described above
     */
    public static function rearrangeTripletsByType($sub_pred_obj_triplets)
    {
        $processed_triplets = [];
        foreach (["CONCISE", "RAW"] as $type) {
            $processed_triplets[$type] =
                self::extractTripletByType($sub_pred_obj_triplets, $type);
        }
        return $processed_triplets;
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE, RAW subfields and, for the given $type, produces the three
     * questions obtained by replacing in turn the subject, object, and
     * predicate of the (subject, object, predicate) sentence with the
     * question marker.
     *
     * @param array $sub_pred_obj_triplets  in format described above
     * @param string $type either CONCISE or RAW
     * @return array empty if any of subject, predicate, object is empty
     *      for $type; otherwise has fields QUESTION_LIST, the list of
     *      questions, and QUESTION_ANSWER_LIST, an array
     *      question => component the question marker replaced
     */
    public static function extractTripletByType($sub_pred_obj_triplets, $type)
    {
        $triplets = [];
        if (empty($sub_pred_obj_triplets["subject"][$type])
            || empty($sub_pred_obj_triplets["predicate"][$type])
            || empty($sub_pred_obj_triplets["object"][$type])) {
            return $triplets;
        }
        $question_marker = self::$question_marker;
        // Hindi word order: subject, then object, then predicate
        $sentence = [$sub_pred_obj_triplets["subject"][$type],
            $sub_pred_obj_triplets["object"][$type],
            $sub_pred_obj_triplets["predicate"][$type]];
        $question_triplets = [];
        $question_answer_triplets = [];
        foreach ($sentence as $i => $answer) {
            $question = $sentence;
            $question[$i] = $question_marker;
            $question_string = preg_replace("/\s+/u", " ",
                trim(implode(" ", $question)));
            $question_triplets[] = $question_string;
            $question_answer_triplets[$question_string] =
                preg_replace("/\s+/u", " ", $answer);
        }
        $triplets["QUESTION_LIST"] = $question_triplets;
        $triplets["QUESTION_ANSWER_LIST"] = $question_answer_triplets;
        return $triplets;
    }
    /**
     * Takes a tagged question and the position of its question word
     * and returns question triplets for it. The subject is parsed from the
     * start of the question, the predicate from the term just after the
     * question word.
     *
     * @param array $tagged_question part-of-speech tagged question as an
     *      array of ("token" => term, "tag" => part of speech) pairs
     * @param int $index position of the question word in $tagged_question
     * @return array with possible fields CONCISE and RAW, each a list
     *      containing one string of form "subject question_marker predicate"
     */
    public static function parseQuestion($tagged_question, $index)
    {
        $generated_questions = [];
        $question_marker = trim(self::getQuestionMarker());
        $tree_np = self::extractNounPhrase($tagged_question,
            ["cur_node" => 0]);
        $tree_vp = self::extractVerbPhrase($tagged_question,
            ["cur_node" => $index + 1]);
        $triplets = [
            "subject" => self::extractSubjectParseTree($tree_np),
            "predicate" => self::extractPredicateParseTree($tree_vp),
        ];
        foreach (["CONCISE", "RAW"] as $type) {
            if (empty($triplets["subject"][$type])
                || empty($triplets["predicate"][$type])) {
                continue;
            }
            $question = trim(trim($triplets["subject"][$type]) .
                " " . $question_marker .
                " " . trim($triplets["predicate"][$type]));
            $generated_questions[$type][] =
                preg_replace("/\s+/u", " ", $question);
        }
        return $generated_questions;
    }
    /**
     * Takes a phrase query entered by user and return true if it is question
     * and false if not. A phrase counts as a question if any of the words
     * in self::$questions occurs anywhere within it.
     *
     * Declared static like the rest of the class's methods; it can still
     * be invoked on an instance.
     *
     * @param string $phrase any statement
     * @return bool returns true if statement is question
     */
    public static function isQuestion($phrase)
    {
        $phrase = trim($phrase);
        foreach (self::$questions as $question_word) {
            if (mb_strpos($phrase, trim($question_word), 0, "UTF-8")
                !== false) {
                return true;
            }
        }
        return false;
    }
    /**
     * The function returns the question marker for the locale
     *
     * @return string the question marker
     */
    public static function getQuestionMarker()
    {
        return self::$question_marker;
    }
    /**
     * Takes a question and returns the triplet from the question. The first
     * term of the question that is one of self::$questions is treated as
     * the question word.
     *
     * @param string $question question to parse
     * @return array question triplet
     */
    public static function questionParser($question)
    {
        $question = preg_replace("/\s+/u", " ", trim($question));
        $tagged_question = self::tagTokenizePartofSpeech($question);
        $index = -1;
        foreach ($tagged_question as $i => $term_pos) {
            if (in_array($term_pos["token"], self::$questions, true)) {
                $index = $i;
                $tagged_question[$i]["tag"] = "p_wh";
                break;
            }
        }
        return self::parseQuestion($tagged_question, $index);
    }
}

ViewGit