Last commit for src/locale/en_US/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\locale\en_US\resources;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library as L;

/**
 * This class has a collection of methods for English locale specific
 * tokenization. In particular, it has a stemmer, a stop word remover (for
 * use mainly in word cloud creation), and a part of speech tagger (for
 * question answering). The stemmer is my stab at implementing the
 * Porter Stemmer algorithm presented at
 * http://tartarus.org/~martin/PorterStemmer/def.txt
 * The code is based on the non-thread safe C version given by Martin Porter.
 * Since PHP is single-threaded this should be okay.
 * Here given a word, its stem is that part of the word that
 * is common to all its inflected variants. For example,
 * tall is common to tall, taller, tallest. A stemmer takes
 * a word and tries to produce its stem.
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * Words we don't want to be stemmed; stem() returns any word found in
     * this list unchanged
     * @var array
     */
    public static $no_stem_list = ["titanic", "programming", "fishing", 'ins',
        "blues", "factorial", "pbs"];
    /**
     * Phrases we would like yioop to rewrite before performing a query.
     * Each key is a phrase as it might be entered and its value is the
     * phrase to use in its place
     * @var array
     */
    public static $semantic_rewrites = [
        "ins" => 'uscis',
        "mimetype" => 'mime',
        "military" => 'armed forces',
        'full metal alchemist' => 'fullmetal alchemist',
        'bruce schnier' => 'bruce schneier',
        'dragonball' => 'dragon ball',
    ];
    /**
     * Any unique identifier corresponding to the component of a triplet which
     * can be answered using a question answer list
     * @var string
     */
    public static $question_token = "qqq";
    /**
     * List of adjective-like parts of speech that might appear in lexicon file
     * @var array
     */
    public static $adjective_type = ["JJ", "JJR", "JJS"];
    /**
     * List of adverb-like parts of speech that might appear in lexicon file
     * @var array
     */
    public static $adverb_type = ["RB", "RBR", "RBS"];
    /**
     * List of conjunction-like parts of speech that might appear in lexicon
     * file
     * @var array
     */
    public static $conjunction_type = ["CC"];
    /**
     * List of determiner-like parts of speech that might appear in lexicon
     * file
     * @var array
     */
    public static $determiner_type = ["DT", "PDT"];
    /**
     * List of noun-like parts of speech that might appear in lexicon file
     * @var array
     */
    public static $noun_type = ["NN", "NNS", "NNP", "NNPS", "PRP"];
    /**
     * List of verb-like parts of speech that might appear in lexicon file
     * @var array
     */
    public static $verb_type = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries
     * @var array
     */
    public static $stop_words = ['a','able','about','above','abst',
        'accordance','according','based','accordingly','across','act',
        'actually','added','adj','affected','affecting','affects','after',
        'afterwards','again','against','ah','all','almost','alone','along',
        'already','also','although','always','am','among','amongst','an','and',
        'announce','another','any','anybody','anyhow','anymore','anyone',
        'anything','anyway','anyways','anywhere','apparently','approximately',
        'are','aren','arent','arise','around','as','aside','ask','asking','at',
        'auth','available','away','awfully','b','back','be','became','because',
        'become','becomes','becoming','been','before','beforehand','begin',
        'beginning','beginnings','begins','behind','being','believe','below',
        'beside','besides','between','beyond','biol','both','brief','briefly',
        'but','by','c','ca','came','can','cannot','cant','cause','causes',
        'certain','certainly', 'click', 'co','com','come',
        'comment', 'comments', 'comes','contain','containing',
        'contains','could','couldnt','d','date','did','didnt',
        'different','do','does','doesnt','doing',
        'done','dont','down','downwards',
        'due','during','e','each','ed','edu','effect','eg','eight','eighty',
        'either','else','elsewhere','end',
        'ending','enough','especially','et',
        'et-al','etc','even','ever','every',
        'everybody','everyone','everything'
        ,'everywhere','ex','except','f','far','few','ff','fifth','first',
        'five','fix','followed','following','follows','for','former',
        'formerly','forth','found','four','from','further','furthermore',
        'g','gave','get','gets','getting','give','given','gives','giving','go',
        'goes','gone','got','gotten','h','had','happens','hardly','has',
        'hasnt','have','havent','having','he','hed','hence','her','here',
        'hereafter','hereby','herein','heres','hereupon','hers','herself',
        'hes','hi','hid','him','himself','his','hither','home','how','howbeit',
        'however', 'http', 'https', 'hundred','i','id','ie','if','ill',
        'im','immediate','immediately',
        'importance','important','in','inc','indeed','index','information',
        'instead','into','invention','inward','is','isnt','it','itd','itll',
        'its','itself','ive','j','just','k','keep','keeps',
        'kept','kg','km','know',
        'known','knows','l','largely','last','lately',
        'later','latter','latterly',
        'least','less','lest','let','lets','like','liked','likely','line',
        'little','ll','look','looking','looks','ltd','m','made','mainly',
        'make','makes','many','may','maybe','me','mean','means','meantime',
        'meanwhile','merely','mg','might','million','miss','ml','more',
        'moreover','most','mostly','mr','mrs','much','mug','must','my',
        'myself','n','na','name','namely','nay','nd','near','nearly',
        'necessarily','necessary','need','needs','neither','never',
        'nevertheless','new','next','nine','ninety','no',
        'nobody','non','none','nonetheless','noone',
        'nor','normally','nos','not',
        'noted','nothing','now','nowhere','o','obtain',
        'obtained','obviously','of',
        'off','often','oh','ok','okay','old','omitted','on','once','one',
        'ones','only','onto','or','ord','other','others',
        'otherwise','ought','our','ours',
        'ourselves','out','outside','over','overall','owing','own','p','page',
        'pages','part','particular','particularly',
        'past','per','perhaps','placed',
        'please','plus','poorly','possible','possibly','potentially','pp',
        'predominantly','present','previously',
        'primarily','probably','promptly',
        'proud','provides','put','q','que','quickly','quite','qv', 'quot',
        'r','ran',
        'rather','rd','re','readily','really','recent','recently','ref','refs',
        'regarding','regardless','regards','related','relatively','research',
        'respectively','resulted','resulting',
        'results','right','run','s','said',
        'same','saw','say','saying','says','sec',
        'section','see','seeing','seem',
        'seemed','seeming','seems',
        'seen','self','selves','sent','seven','several',
        'shall','she','shed','shell',
        'shes','should','shouldnt','show','showed','shown','showns','shows',
        'significant','significantly','similar','similarly','since',
        'six','slightly',
        'so','some','somebody','somehow','someone','somethan',
        'something','sometime',
        'sometimes','somewhat','somewhere','soon',
        'sorry','specifically','specified',
        'specify','specifying','still','stop','strongly','sub','substantially',
        'successfully','such','sufficiently','suggest','sup','sure','t','take',
        'taken','taking','tell','tends','th','than',
        'thank','thanks','thanx','that',
        'thatll','thats','thatve','the','their',
        'theirs','them','themselves','then',
        'thence','there','thereafter','thereby','thered','therefore','therein',
        'therell','thereof','therere','theres','thereto','thereupon','thereve',
        'these','they','theyd','theyll','theyre',
        'theyve','think','this','those',
        'thou','though','thoughh','thousand','throug',
        'through','throughout','thru',
        'thus','til', 'till','tip','to','together','too',
        'took','toward','towards','tried',
        'tries','truly','try','trying','ts','twice','two','u','un','under',
        'unfortunately','unless','unlike','unlikely','until','unto','up',
        'upon','ups','us','use','used','useful','usefully','usefulness',
        'uses','using','usually','v','value','various','ve','very',
        'via','viz','vol','vols','vs',
        'w','want','wants','was','wasnt','way','we',
        'wed','welcome','well','went',
        'were','werent','weve','what','whatever',
        'whatll','whats','when','whence',
        'whenever','where','whereafter','whereas','whereby','wherein','wheres',
        'whereupon','wherever','whether','which','while','whim','whither',
        'who','whod','whoever','whole','wholl','whom','whomever','whos',
        'whose','why','widely', 'will', 'willing','wish','with','within',
        'without','wont','words','world',
        'would','wouldnt','www','x','y','yes','yet','you','youd','youll',
        'your','youre','yours','yourself','yourselves','youve','z','zero'];
    /**
     * Storage used in computing the stem; stem() copies the word being
     * stemmed into it before running the stemming steps
     * @var string
     */
    private static $buffer;
    /**
     * Index of the current end of the word at the current state of computing
     * its stem (stem() starts it at the index of the word's last character)
     * @var int
     */
    private static $k;
    /**
     * Index to start of the suffix of the word being considered for
     * manipulation (stem() starts it equal to $k)
     * @var int
     */
    private static $j;
    /**
     * Constructor. The en-US tokenizer needs no global set up, so this is
     * intentionally a no-op
     */
    public function __construct()
    {
    }
    /**
     * Placeholder for a word segmenter. A real segmenter given
     * thisisabunchofwords would produce
     * this is a bunch of words
     * For en-US no segmentation is performed.
     *
     * @param string $pre_segment text before segmentation
     * @return string the text with words separated by spaces; here it is
     *     simply the input handed back untouched
     */
    public static function segment($pre_segment)
    {
        $segmented = $pre_segment;
        return $segmented;
    }
    /**
     * Strips this locale's stop words out of the supplied text (used for
     * Word Cloud generation)
     *
     * @param mixed $data either a string or an array of strings to remove
     *      stop words from
     * @return mixed $data with each whole-word, case-insensitive occurrence
     *      of a stop word deleted
     */
    public static function stopwordsRemover($data)
    {
        // the alternation over all stop words is only assembled once
        static $pattern = "";
        if (empty($pattern)) {
            $alternatives = implode('|', self::$stop_words);
            $pattern = '/\b(' . $alternatives . ')\b/ui';
        }
        return preg_replace($pattern, '', $data);
    }
    /**
     * This method tries to handle punctuation in terms specific to the
     * English language such as abbreviations and contractions. The string
     * is rewritten in place: the period after a known abbreviation that is
     * followed by a capital letter (or the end of the string) is dropped,
     * common slang and contractions are expanded, and a possessive 's is
     * rewritten to the marker _pos_s.
     *
     * The substitutions are applied in the order listed, so the specific
     * contractions (can't, shan't, ...) must stay ahead of the generic
     * n't, 'd, 'll, 've, 're, 's rules.
     *
     * @param string &$string a string of words, etc which might involve such
     *      terms; modified by reference
     */
    public function canonicalizePunctuatedTerms(&$string)
    {
        static $substitutions = [
            //abbreviated titles
            "/([mM]r|[mM]rs|[mM]s|[dD]r|[dD]rs|[iI]n|".
            "[cC]apt|[cC]pl|[sS]t|[fF]t|[vV]s)\.(\s*)(\p{Lu}|\Z)/u" => '$1 $3',
            // NOTE(review): the leading comma is kept from the original
            // pattern; confirm it is intended for single-letter initials
            "/,(\p{Lu})\.(\s*)(\p{Lu}|\Z)/u" => '$1_ $3',
            "/gimme/i" => "give me",
            "/gonna/i" => "going to",
            "/gotta/i" => "got to",
            "/ma\'am/i" => "madam",
            "/\'tis/i" => "it is",
            "/\'twas/i" => "it was",
            "/y\'all/i" => "you all",
            "/I\'m/" => "I am",
            "/I ain\'t/" => "I am not",
            "/You ain\'t/i" => "you are not",
            "/can\'t/i" => "cannot",
            "/(why|who|which|when|what|this|that|there|how|" .
                "it|everyone|one|he|she)\'s/i" => "$1 is",
            // repair results of the 's => is rule above
            "/is been/" => "has been",
            "/is had/" => "has had",
            // shan't
            "/shan\'t/" => "shall not",
            // contractions with would
            "/\b(\p{L}+)\'d/" => ' $1 would',
            // contractions with not
            "/\b(\p{L}+)n\'t/" => ' $1 not',
            // contractions with will
            "/\b(\p{L}+)\'ll/" => ' $1 will',
            // contractions with have
            "/\b(\p{L}+)\'ve/" => ' $1 have',
            // contractions with are
            "/\b(\p{L}+)\'re/" => ' $1 are',
            // anything still ending in 's is treated as a possessive
            "/\b(\p{L}+)\'s/" => ' $1_pos_s'
        ];
        // split into parallel arrays once; reused on later calls
        static $patterns = [];
        static $replacements = [];
        if (empty($patterns)) {
            $patterns = array_keys($substitutions);
            $replacements = array_values($substitutions);
        }
        $string = preg_replace($patterns, $replacements, $string);
    }
    /**
     * Tags every term of a phrase with its part of speech, so that each
     * term of the original phrase is mapped to term~part_of_speech.
     * The tagger is based on a Brill tagger. It uses a lexicon made of
     * words from the Brown corpus together with the part of speech tags each
     * word had in that corpus. The lexicon supplies an initial tag (a word
     * that is not present is assumed to be a noun), and a fixed set of rules
     * then adjusts that initial tag where necessary.
     *
     * @param string $phrase text to add parts speech tags to
     * @param bool $with_tokens whether to include the terms and the tags
     *      in the output string or just the part of speech tags
     * @return string $tagged_phrase phrase where each term has ~part_of_speech
     *      appended ($with_tokens == true) or just space separated
     *      part_of_speech (!$with_tokens)
     */
    public static function tagPartsOfSpeechPhrase($phrase, $with_tokens = true)
    {
        return self::taggedPartOfSpeechTokensToString(
            self::tagTokenizePartOfSpeech($phrase), $with_tokens);
    }
    /**
     * Split input text into terms and output an array with one element
     * per term, that element consisting of array with the term token
     * and the part of speech tag.
     *
     * Each term first receives the first tag listed for it in the lexicon
     * (NN when it is not in the lexicon), after which a fixed sequence of
     * rewrite rules based on the term's suffix and on the preceding term
     * may change the tag. Tokens are reported in their original case, but
     * all lexicon look-ups and rule comparisons use the lowercased term.
     *
     * @param string $text string to tag and tokenize
     * @return array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term) for one each token in $text
     */
    public static function tagTokenizePartOfSpeech($text)
    {
        /* if run as own server dictionary only loaded once for all requests */
        static $dictionary = [];
        if (empty($dictionary)) {
            $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz";
            if (file_exists($lexicon_file)) {
                $lex_data = gzdecode(file_get_contents($lexicon_file));
                /* the pattern treats each line as a word followed by its
                   space separated candidate tags */
                preg_match_all("/([^\s\,]+)[\s|\,]+([^\n]+)/u",
                    $lex_data, $lex_parts);
                $dictionary = array_combine($lex_parts[1], $lex_parts[2]);
            }
        }
        preg_match_all("/[\w\d]+/", $text, $matches);
        $tokens = $matches[0];
        $nouns = ['NN', 'NNS', 'NNP'];
        $verbs = ['VBD', 'VBP', 'VB'];
        $result = [];
        // tag and lowercased form of the term handled just before this one
        $previous_tag = "";
        $previous_token = "";
        // now using our dictionary we tag
        $i = 0;
        $tag_list = [];
        foreach ($tokens as $token) {
            $prev_tag_list = $tag_list;
            $tag_list = [];
            // default to a common noun
            $current = ['token' => $token, 'tag' => 'NN'];
            // look-ups and rule comparisons are done on the lowercased term
            $token = mb_strtolower($token);
            if ($token == "howto") {
                $current['tag'] = 'WP';
            } else if (!empty($dictionary[$token])) {
                $tag_list = explode(" ", $dictionary[$token]);
                $current['tag'] = $tag_list[0];
            }
            /* Converts verbs after 'the' to nouns. The lowercased previous
               term is used so a capitalized "The" is recognized as well
             */
            if ($previous_token === 'the' &&
                in_array($current['tag'], $verbs)) {
                $current['tag'] = 'NN';
            }
            /* Convert noun to number if . appears.
               NOTE(review): tokens come from the [\w\d]+ match above and so
               never contain '.', which means this rule cannot currently fire
             */
            if ($current['tag'][0] == 'N' && strpos($token, '.') !== false) {
                $current['tag'] = 'CD';
            }
            $ends_with = substr($token, -2);
            switch ($ends_with) {
                case 'ed':
                    // Convert noun to past participle if ends with 'ed'
                    if ($current['tag'][0] == 'N') {
                        $current['tag'] = 'VBN';
                    }
                break;
                case 'ly':
                    // Anything that ends 'ly' is an adverb
                    $current['tag'] = 'RB';
                break;
                case 'al':
                    // Common noun to adjective if it ends with al
                    if (in_array($current['tag'], $nouns)) {
                        $current['tag'] = 'JJ';
                    }
                break;
            }
            // Noun to verb if the word before is 'would'
            if ($current['tag'] == 'NN' && $previous_token === 'would') {
                $current['tag'] = 'VB';
            }
            // Convert common noun to gerund
            if (in_array($current['tag'], $nouns) &&
                substr($token, -3) == 'ing') {
                $current['tag'] = 'VBG';
            }
            /* noun followed by adjective, where the noun could also have
               been an adjective: swap the two roles
             */
            if (in_array($previous_tag, $nouns) &&
                $current['tag'] == 'JJ' && in_array('JJ', $prev_tag_list)) {
                $result[$i - 1]['tag'] = 'JJ';
                $current['tag'] = 'NN';
            }
            /* If we have a noun, and the second can be a verb,
             * convert to verb; if noun noun and previous could be an
             * adjective convert to adjective
             */
            if (in_array($previous_tag, $nouns) &&
                in_array($current['tag'], $nouns)) {
                if (in_array('VBN', $tag_list)) {
                    $current['tag'] = 'VBN';
                } else if (in_array('VBZ', $tag_list)) {
                    $current['tag'] = 'VBZ';
                } else if (in_array('JJ', $prev_tag_list)) {
                    $result[$i - 1]['tag'] = 'JJ';
                }
            }
            $result[$i] = $current;
            $i++;
            $previous_tag = $current['tag'];
            $previous_token = $token;
        }
        return $result;
    }
    /**
     * Takes a phrase query entered by user and return true if it is question
     * and false if not. A phrase counts as a question when, after
     * surrounding whitespace is trimmed, it begins with one of the words
     * who, what, which, where, when, whose, or how. The comparison ignores
     * letter case, so "What is ..." and "what is ..." are treated alike.
     *
     * @param string $phrase any statement
     * @return bool returns true if statement is question
     */
    public function isQuestion($phrase)
    {
        /* only the start of the phrase is inspected, so whatever follows
           (including line breaks) has no influence on the outcome */
        $regex_starts_with_que =
            "/^(who|what|which|where|when|whose|how)/i";
        return preg_match($regex_starts_with_que, trim($phrase)) === 1;
    }
    /**
     * Computes the stem of an English word
     *
     * For example, jumps, jumping, jumpy, all have jump as a stem.
     * Words in self::$no_stem_list are handed back unchanged, as are words
     * of at most two characters. For a hyphenated word only the part after
     * the last hyphen is stemmed.
     *
     * @param string $word the string to stem
     * @return string the stem of $word
     */
    public static function stem($word)
    {
        if (in_array($word, self::$no_stem_list)) {
            return $word;
        }
        // for hyphenated words, only stem last word -- my change CP
        $hyphen_pos = strrpos($word, "-");
        if ($hyphen_pos > 0) {
            $split_at = $hyphen_pos + 1;
            $prefix = substr($word, 0, $split_at);
            $last_part = substr($word, $split_at);
            return $prefix . self::stem($last_part);
        }
        $last_index = strlen($word) - 1;
        self::$buffer = $word;
        self::$k = $last_index;
        self::$j = $last_index;
        if ($last_index <= 1) {
            // too short to have a meaningful suffix
            return $word;
        }
        self::step1ab();
        self::step1c();
        self::step2();
        self::step3();
        self::step4();
        self::step5();
        // the steps shrink or rewrite the buffer and leave its end in $k
        return substr(self::$buffer, 0, self::$k + 1);
    }
    /**
     * Take in a sentence and try to compress it to a smaller version
     * that "retains the most important information and remains grammatically
     * correct" (Jing 2000). Every span matched by one of the delete
     * patterns below is replaced by a single space; the patterns are applied
     * one after another in the order listed.
     *
     * @param string $sentence_to_compress the sentence to compress
     * @return string the compressed sentence
     */
    public static function compressSentence($sentence_to_compress)
    {
        // patterns are based on From Back to Basics: CLASSY 2006 page 3:
        static $delete_patterns = [
            /*
              2. We remove many adverbs and all conjunctions,
              including phrases such as "As a matter of fact," and
              "At this point," that occur at the start  of a sentence.
             */
            "/^At this point,?\b/i", "/^As a matter of fact,?\b/i",
            "/^[a-zA-Z]*ly\b/i", "/(^and,?\b)|(^but,?\b)|(^for,?\b)|" .
            "(^nor,?\b)|(^or,?\b)|(^so,?\b)|(^yet,?\b)/i",
            /*
              3. We remove a small selections of words that occur in the middle
              of a sentence, such as ", however," and ", also," (not always
              requiring the commas). The \b anchors keep these from matching
              inside longer words (e.g., the "soon" in "monsoon").
             */
            "/(;|,)?\s*\b(nevertheless|today|tomorrow|soon|instead|" .
            "in practice|however|as a practical matter|further(more)?|".
            "as such|also)\b\s*(;|,)?\s*/i",
            /*
              4. For DUC 2006, we added the removal of ages such as ", 51," or
              ", aged 24,".
             */
            "/,\s?\d{1,3},/i", "/,\s?aged\s?\d{1,3},/i",
            /*
              6. We remove relative clause attributives (clauses beginning with
              "who(m)", "which", "when", and "where") wherever possible.
            */
            "/(,\s?whom?[^,]*,)|(,\s?which[^,]*,)|" .
            "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i"
        ];
        return preg_replace($delete_patterns, " ", $sentence_to_compress);
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
     * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST
     * subfields
     *
     * @param array $sub_pred_obj_triplets in format described above
     * @return array $processed_triplets in format described above; the
     *      value of each of the two fields is whatever
     *      self::extractTripletByType produces for that type
     */
    public static function rearrangeTripletsByType($sub_pred_obj_triplets)
    {
        return [
            'CONCISE' => self::extractTripletByType($sub_pred_obj_triplets,
                "CONCISE"),
            'RAW' => self::extractTripletByType($sub_pred_obj_triplets,
                "RAW"),
        ];
    }
    /**
     * Beginning at position $cur_node of the tagged term list $tagged_phrase
     * for an English sentence, gathers the run of consecutive terms whose
     * tag belongs to the part of speech group $type. Once at least one term
     * has been taken, conjunctions (self::$conjunction_type) are allowed to
     * continue the run; a conjunction left dangling at the end of the run
     * is given back.
     *
     * @param int &$cur_node position within $tagged_phrase to start from;
     *      on return it has been advanced past the terms that were taken
     * @param array $tagged_phrase array of ("token" => ..., "tag" => ...)
     *      pairs for the phrase
     * @param array $type self::$noun_type, self::$verb_type, etc
     * @return string the taken tokens, each preceded by a single space, or
     *      "" if the term at $cur_node was not of the requested $type
     */
    public static function parseTypeList(&$cur_node, $tagged_phrase, $type)
    {
        $first_node = $cur_node;
        $phrase = "";
        $phrase_before_last = "";
        $last_tag = "";
        // conjunctions only become acceptable after the first term is taken
        $conjuncts = [];
        while (true) {
            $tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
                trim($tagged_phrase[$cur_node]['tag']);
            if (!$tag || (!in_array($tag, $type) &&
                !in_array($tag, $conjuncts))) {
                break;
            }
            $phrase_before_last = $phrase;
            $phrase .= " ". $tagged_phrase[$cur_node]['token'];
            $cur_node++;
            $conjuncts = self::$conjunction_type;
            $last_tag = $tag;
        }
        // don't let the run end on a conjunction: back up over it
        if ($first_node < $cur_node && in_array($last_tag, $conjuncts)) {
            $cur_node--;
            $phrase = $phrase_before_last;
        }
        return $phrase;
    }
    /**
     * Given a part-of-speech tagged phrase and a pre-tree holding a
     * parse-from position, tries to parse a run of adjectives there
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "JJ" the string of adjective tokens that was parsed (only set
     *      when at least one adjective was found)
     */
    public static function parseAdjective($tagged_phrase, $tree)
    {
        // cur_node is passed by reference and advanced by parseTypeList
        $found = self::parseTypeList($tree['cur_node'],
            $tagged_phrase, self::$adjective_type);
        if (empty($found)) {
            return $tree;
        }
        $tree["JJ"] = $found;
        return $tree;
    }
    /**
     * Given a part-of-speech tagged phrase and a pre-tree holding a
     * parse-from position, tries to parse a determiner there
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "DT" the string of determiner tokens that was parsed (only set
     *      when at least one determiner was found)
     */
    public static function parseDeterminer($tagged_phrase, $tree)
    {
        /* In: All the cows low, "All the" is considered a determiner.
           That is, we will mush together the predeterminer with the determiner
         */
        $found = self::parseTypeList($tree['cur_node'],
            $tagged_phrase, self::$determiner_type);
        if (empty($found)) {
            return $tree;
        }
        $tree["DT"] = $found;
        return $tree;
    }
    /**
     * Given a part-of-speech tagged phrase and a pre-tree holding a
     * parse-from position, tries to parse a run of nouns there
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NN" the string of noun tokens that was parsed (only set when
     *      at least one noun was found)
     */
    public static function parseNoun($tagged_phrase, $tree)
    {
        // consecutive nouns are combined into a single string
        $found = self::parseTypeList($tree['cur_node'], $tagged_phrase,
            self::$noun_type);
        if (empty($found)) {
            return $tree;
        }
        $tree["NN"] = $found;
        return $tree;
    }
    /**
     * Given a part-of-speech tagged phrase and a pre-tree holding a
     * parse-from position, tries to parse a run of verbs there
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VB" the string of verb tokens that was parsed (only set when
     *      at least one verb was found)
     */
    public static function parseVerb($tagged_phrase, $tree)
    {
        // cur_node is passed by reference and advanced by parseTypeList
        $found = self::parseTypeList($tree['cur_node'], $tagged_phrase,
            self::$verb_type);
        if (empty($found)) {
            return $tree;
        }
        $tree["VB"] = $found;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a sequence of
     * prepositional phrases if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @param int $index number of the clause being parsed; used as the
     *      suffix of the field names written for this clause
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase, which is
     *      past every prepositional phrase of the sequence
     *      followed, when the term at the starting position is a
     *      preposition, by additional possible fields (here i is $index):
     *      "IN_i" with value the preposition string
     *      "DT_i" with value a determiner string
     *      "JJ_i" with value an adjective string
     *      "NP_i" with value a noun string
     *      "PRP" with value the tree (without "cur_node") for the
     *      prepositional phrases that follow this one, numbered from i + 1;
     *      an empty array if there are none
     */
    public static function parsePrepositionalPhrases($tagged_phrase, $tree,
        $index = 1)
    {
        $cur_node = $tree['cur_node'];
        // Checking for preposition. I.e, format: prep [det] [adjective] noun
        if (isset($tagged_phrase[$cur_node]['tag']) &&
            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
            /* can have multiple prep's in a row, for example,
               it is known in over 20 countries*/
            $preposition_string = self::parseTypeList($cur_node, $tagged_phrase,
                ["IN"]);
            if (!empty($preposition_string)) {
                $tree["IN_$index"] = $preposition_string;
            }
            $determiner_string = self::parseTypeList($cur_node, $tagged_phrase,
                self::$determiner_type);
            if (!empty($determiner_string)) {
                $tree["DT_$index"] = $determiner_string;
            }
            $adjective_string = self::parseTypeList($cur_node, $tagged_phrase,
                self::$adjective_type);
            if (!empty($adjective_string)) {
                $tree["JJ_$index"] = $adjective_string;
            }
            $prep_noun_string = self::parseTypeList($cur_node, $tagged_phrase,
                self::$noun_type);
            if (!empty($prep_noun_string)) {
                $tree["NP_$index"] = $prep_noun_string;
            }
            /* if have more than one phrase in a row:
               the drought happened in many countries over many years.
             */
            $tree_next = self::parsePrepositionalPhrases($tagged_phrase,
                ["cur_node" => $cur_node], $index + 1);
            /* the terms of the following phrases are now part of this tree,
               so the parse position has to move past them as well; otherwise
               the caller would resume parsing in the middle of them
             */
            $cur_node = $tree_next['cur_node'];
            unset($tree_next['cur_node']);
            $tree['PRP'] = $tree_next;
        }
        $tree['cur_node'] = $cur_node;
        return $tree;
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a noun phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
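     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "NP" a subarray (or "" if no noun phrase was found) with
     *      possible fields
     *      "DT" with value a determiner subtree
     *      "JJ" with value an adjective subtree
     *      "NN" with value a noun tree
     *      "PRP" with value a prepositional phrase subtree
     *      "AUX" with value an auxiliary clause subtree
     *      "CC" with value a conjunction token, if any
     *      "ADD_NP" with value a further noun phrase subtree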
     */
    public static function parseNounPhrase($tagged_phrase, $tree)
    {
        $start = $tree['cur_node'];
        // each sub-parser is started where the previous one stopped
        $determiner = self::parseDeterminer($tagged_phrase,
            ['cur_node' => $start]);
        $adjective = self::parseAdjective($tagged_phrase,
            ['cur_node' => $determiner['cur_node']]);
        $noun = self::parseNoun($tagged_phrase,
            ['cur_node' => $adjective['cur_node']]);
        if ($noun['cur_node'] == $start) {
            // the parse position did not move, so report an empty NP
            $tree['NP'] = "";
            return $tree;
        }
        $prepositional = self::parsePrepositionalPhrases($tagged_phrase,
            ['cur_node' => $noun['cur_node']]);
        $auxiliary = self::parseAuxClause($tagged_phrase,
            ['cur_node' => $prepositional['cur_node']]);
        $position = $auxiliary['cur_node'];
        $conjunction = "";
        $at_conjunction = !empty($tagged_phrase[$position]['tag']) &&
            in_array($tagged_phrase[$position]['tag'],
            self::$conjunction_type);
        if ($at_conjunction) {
            $conjunction = $tagged_phrase[$position]['token'];
            $position++;
        }
        // look for a further noun phrase (after the conjunction if any)
        $additional = self::parseNounPhrase($tagged_phrase,
            ['cur_node' => $position]);
        if ($conjunction && $additional['cur_node'] == $position) {
            /* a conjunction was consumed but no noun phrase followed it,
               so step back in front of the conjunction and forget it */
            $position--;
            $additional = [];
            $conjunction = "";
        } else {
            $position = $additional['cur_node'];
        }
        // positions are bookkeeping only and do not belong in the subtrees
        unset($determiner['cur_node'], $adjective['cur_node'],
            $noun['cur_node'], $prepositional['cur_node'],
            $auxiliary['cur_node'], $additional['cur_node']);
        return [
            'cur_node' => $position,
            'NP' => [
                'DT' => $determiner,
                'JJ' => $adjective,
                'NN' => $noun,
                'PRP' => $prepositional,
                'AUX' => $auxiliary,
                'CC' => $conjunction,
                'ADD_NP' => $additional,
            ],
        ];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for a verb phrase if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     *      "VP" a subarray with possible fields
     *      "VB" with value a verb subtree
     *      "NP" with value a noun phrase subtree
     */
    public static function parseVerbPhrase($tagged_phrase, $tree)
    {
        $position = $tree['cur_node'];
        // adverbs that come in front of the verb
        $adverbs = self::parseTypeList($position, $tagged_phrase,
            self::$adverb_type);
        $verb_tree = self::parseVerb($tagged_phrase,
            ['cur_node' => $position]);
        if ($verb_tree['cur_node'] == $position) {
            // if no verb return what started with
            return $tree;
        }
        $position = $verb_tree['cur_node'];
        // adverbs that come directly after the verb
        $trailing_adverbs = self::parseTypeList($position, $tagged_phrase,
            self::$adverb_type);
        if (trim($trailing_adverbs) == 'very') {
            /* a lone "very" is not attached to the verb; the term right
               after the verb is re-tagged as an adjective in the local copy
               of the phrase so the noun phrase parse below can pick it up */
            $tagged_phrase[$verb_tree['cur_node']]['tag'] = 'JJ';
        } else {
            $adverbs .= $trailing_adverbs;
            $verb_tree['cur_node'] = $position;
        }
        $noun_tree = self::parseNounPhrase($tagged_phrase,
            ['cur_node' => $verb_tree['cur_node']]);
        // adverbs that come after the noun phrase
        $adverbs .= self::parseTypeList($noun_tree['cur_node'],
            $tagged_phrase, self::$adverb_type);
        if (!empty($adverbs)) {
            $verb_tree["RB"] = $adverbs;
        }
        $position = $noun_tree['cur_node'];
        unset($verb_tree['cur_node']);
        $verb_phrase = ['VB' => $verb_tree];
        if (!empty($noun_tree['NP'])) {
            $verb_phrase['NP'] = $noun_tree['NP'];
        }
        return ['VP' => $verb_phrase, 'cur_node' => $position];
    }
    /**
     * Given a part-of-speech tagged phrase array generates a parse tree
     * for the phrase using a recursive descent parser.
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array used to represent a tree. The array has up to three fields
     *      $tree["cur_node"] index of how far we parsed our $tagged_phrase
     *      $tree["NP"] contains a subtree for a noun phrase
     *      $tree["VP"] contains a subtree for a verb phrase
     */
    public static function parseWholePhrase($tagged_phrase, $tree)
    {
        // for example: In the dark of winter, he walked silently.
        $leading_phrase = self::parsePrepositionalPhrases($tagged_phrase,
            ["cur_node" => $tree['cur_node']]);
        // $position is the index in tagged_phrase we've parsed to so far
        if (empty($leading_phrase['cur_node'])) {
            $position = $tree['cur_node'];
        } else {
            $position = $leading_phrase['cur_node'];
        }
        unset($leading_phrase['cur_node']);
        $noun_tree = self::parseNounPhrase($tagged_phrase,
            ["cur_node" => $position]);
        if ($noun_tree['cur_node'] == $position) {
            // no noun phrase, give back the tree we were handed
            return $tree;
        }
        $after_noun = $noun_tree['cur_node'];
        $verb_tree = self::parseVerbPhrase($tagged_phrase,
            ["cur_node" => $after_noun]);
        if ($verb_tree['cur_node'] == $after_noun) {
            // no verb phrase, give back the tree we were handed
            return $tree;
        }
        $noun_phrase = $noun_tree['NP'];
        if (!empty($leading_phrase) && !empty($noun_phrase)) {
            // a leading prepositional phrase is stored with the noun phrase
            $noun_phrase['PRP-1'] = $leading_phrase;
        }
        return [
            'cur_node' => $verb_tree['cur_node'],
            'NP' => $noun_phrase,
            'VP' => $verb_tree['VP'],
        ];
    }
    /**
     * Takes a part-of-speech tagged phrase and pre-tree with a
     * parse-from position and builds a parse tree for an auxiliary clause
     * if possible
     *
     * @param array $tagged_phrase
     *      an array of pairs of the form ("token" => token_for_term,
     *     "tag"=> part_of_speech_tag_for_term)
     * @param array $tree that consists of ["cur_node" =>
     *      current parse position in $tagged_phrase]
     * @return array has fields
     *      "cur_node" index of how far we parsed $tagged_phrase
     */
    public static function parseAuxClause($tagged_phrase, $tree)
    {
        // words that can introduce an auxiliary clause
        $clause_starters = ["that", "who", "which", "because", "like",
            "as"];
        $position = $tree["cur_node"];
        $token = "";
        if (!empty($tagged_phrase[$position]["token"])) {
            $token = trim($tagged_phrase[$position]["token"]);
        }
        if (!in_array($token, $clause_starters)) {
            return $tree;
        }
        $position++;
        $clause_start = ["cur_node" => $position];
        // first try a bare verb phrase, then fall back to a whole phrase
        $phrase = self::parseVerbPhrase($tagged_phrase, $clause_start);
        if ($phrase['cur_node'] == $position) {
            $phrase = self::parseWholePhrase($tagged_phrase, $clause_start);
            if ($phrase['cur_node'] == $position) {
                // nothing parsable followed the introducing word
                return $tree;
            }
        }
        $position = $phrase['cur_node'];
        unset($phrase['cur_node']);
        return [
            'cur_node' => $position,
            'IN' => $token,
            'PHRASE' => $phrase,
        ];
    }
    /**
     * Takes a parse tree of a phrase and computes subject, predicate, and
     * object arrays. Each of these arrays consists of two components, CONCISE
     * and RAW: CONCISE holds the deepest noun (or verb) entry found for that
     * part of the phrase, while RAW holds all of the words in that part of
     * the parse tree joined by spaces
     *
     * @param array $tree a parse tree for a sentence
     * @return array triplet array
     */
    public static function extractTripletsParseTree($tree)
    {
        // one extractor per sentence part, each gets the full parse tree
        return [
            'subject' => self::extractSubjectParseTree($tree),
            'predicate' => self::extractPredicateParseTree($tree),
            'object' => self::extractObjectParseTree($tree),
        ];
    }
    /**
     * Scans a word list for phrases. For phrases found generate
     * a list of question and answer pairs at two levels of granularity:
     * CONCISE (using only the deepest noun and verb entries of the parse
     * tree) and RAW (using all the terms kept in the parse tree).
     *
     * @param array $word_and_phrase_list of statements
     * @return array with two fields: QUESTION_LIST, which maps each triplet
     *      (SUBJECT, PREDICATE, OBJECT) in which one of the components has
     *      been replaced with a question marker to the position list of the
     *      phrase it came from; and QUESTION_ANSWER_LIST, the merged
     *      question to answer lists of the phrases
     */
    public static function extractTripletsPhrases($word_and_phrase_list)
    {
        $questions_to_positions = [];
        $questions_to_answers = [];
        // only keys with at least PHRASE_THRESHOLD words are treated as phrases
        $phrases = array_filter($word_and_phrase_list,
            function ($phrase) {
                return str_word_count($phrase) >= C\PHRASE_THRESHOLD;
            }, \ARRAY_FILTER_USE_KEY);
        foreach ($phrases as $phrase => $position_list) {
            $phrase = self::compressSentence($phrase);
            // strip parentheticals
            $phrase = preg_replace("/[\{\[\(][^\}\]\)]+[\}\]\)]/u",
                "", $phrase);
            $tagged_phrase = self::tagTokenizePartOfSpeech($phrase);
            $parse_tree = self::parseWholePhrase($tagged_phrase,
                ['cur_node' => 0]);
            $typed_triplets = self::rearrangeTripletsByType(
                self::extractTripletsParseTree($parse_tree));
            foreach (['CONCISE', 'RAW'] as $type) {
                if (empty($typed_triplets[$type])) {
                    continue;
                }
                $of_type = $typed_triplets[$type];
                // every question of the phrase shares the phrase's positions
                foreach ($of_type['QUESTION_LIST'] as $question) {
                    $questions_to_positions[$question] = $position_list;
                }
                $questions_to_answers = array_merge($questions_to_answers,
                    $of_type['QUESTION_ANSWER_LIST']);
            }
        }
        return [
            'QUESTION_LIST' => $questions_to_positions,
            'QUESTION_ANSWER_LIST' => $questions_to_answers,
        ];
    }
    /**
     * Takes phrase tree $tree and a part-of-speech $pos returns
     * the deepest $pos only path in tree.
     *
     * @param array $tree phrase to extract type from
     * @param string $pos the part of speech to extract
     * @return string the label of deepest $pos only path in $tree
     */
    public static function extractDeepestSpeechPartPhrase($tree, $pos)
    {
        if (empty($tree[$pos])) {
            // no $pos entry at this level, so no $pos only path
            return "";
        }
        $child = $tree[$pos];
        // prefer whatever can be found further down the $pos only path
        $deeper = self::extractDeepestSpeechPartPhrase($child, $pos);
        if ($deeper) {
            return $deeper;
        }
        // otherwise the entry two levels down, if present, is the deepest
        return empty($child[$pos]) ? "" : $child[$pos];
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the deepest noun
     * entry of the object of the original phrase, the latter having all
     * the words of the object joined by spaces
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractObjectParseTree($tree)
    {
        $object = ['CONCISE' => "", 'RAW' => ""];
        // the object is the noun phrase stored inside the verb phrase
        if (empty($tree['VP']) || empty($tree['VP']['NP'])) {
            return $object;
        }
        $noun_phrase = $tree['VP']['NP'];
        $object['CONCISE'] = self::extractDeepestSpeechPartPhrase(
            $noun_phrase, "NN");
        // RAW is every leaf of the noun phrase subtree, each followed by " "
        $leaves = new \RecursiveIteratorIterator(
            new \RecursiveArrayIterator($noun_phrase));
        foreach ($leaves as $leaf) {
            $object['RAW'] .= $leaf . " ";
        }
        return $object;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the deepest verb
     * entry of the predicate of the original phrase, the latter having all
     * the words of the verb subtree joined by spaces
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractPredicateParseTree($tree)
    {
        /* both fields are always present in the result, even when the verb
           phrase has no VB subtree, so callers can read RAW without first
           checking that it exists */
        $predicate = ['CONCISE' => "", 'RAW' => ""];
        if (empty($tree['VP'])) {
            return $predicate;
        }
        $tree_vp = $tree['VP'];
        $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
            $tree_vp, "VB");
        if (!empty($tree_vp['VB'])) {
            // RAW is every leaf of the verb subtree, each followed by " "
            $raw_predicate = "";
            $it = new \RecursiveIteratorIterator(
                new \RecursiveArrayIterator($tree_vp['VB']));
            foreach ($it as $v) {
                $raw_predicate .= $v . " ";
            }
            $predicate['RAW'] = $raw_predicate;
        }
        return $predicate;
    }
    /**
     * Takes a parse tree of a phrase or statement and returns an array
     * with two fields CONCISE and RAW the former having the deepest noun
     * entry of the subject of the original phrase, the latter having all
     * the words of the subject joined by spaces
     *
     * @param array $tree representation of a parse tree of a phrase
     * @return array with two fields CONCISE and RAW as described above
     */
    public static function extractSubjectParseTree($tree)
    {
        $subject = ['CONCISE' => "", 'RAW' => ""];
        // the subject is the top level noun phrase of the tree
        if (empty($tree['NP'])) {
            return $subject;
        }
        $noun_phrase = $tree['NP'];
        $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase(
            $noun_phrase, "NN");
        // RAW is every leaf of the noun phrase subtree, each followed by " "
        $leaves = new \RecursiveIteratorIterator(
            new \RecursiveArrayIterator($noun_phrase));
        foreach ($leaves as $leaf) {
            $subject['RAW'] .= $leaf . " ";
        }
        return $subject;
    }
    /**
     * Takes a part-of-speech tagged question that starts with Who
     * and returns the question triplets generated from it
     *
     * @param array $tagged_question part-of-speech tagged question
     * @param int $index current index in statement
     * @return array parsed triplet
     */
    public static function parseWhoQuestion($tagged_question, $index)
    {
        // "WHO" stands in for the unknown subject; only a verb phrase is parsed
        $verb_tree = self::parseVerbPhrase($tagged_question,
            ["cur_node" => $index, "NP" => "WHO"]);
        $predicate = self::extractPredicateParseTree($verb_tree);
        $object = self::extractObjectParseTree($verb_tree);
        $generated_questions = [];
        foreach (['CONCISE', 'RAW'] as $type) {
            if (empty($object[$type]) || empty($predicate[$type])) {
                continue;
            }
            $object_text = trim($object[$type]);
            $predicate_text = trim($predicate[$type]);
            // question token is tried in both the last and first position
            $generated_questions[$type][] = $object_text . " " .
                $predicate_text . " " . self::$question_token;
            $generated_questions[$type][] = self::$question_token . " " .
                $predicate_text . " " . $object_text;
        }
        return $generated_questions;
    }
    /**
     * Takes a part-of-speech tagged question that starts with a Wh+ word
     * other than Who and returns the question triplets generated from it.
     * Unlike the WHO case, here we assume there is an auxiliary verb
     * followed by a noun phrase then the rest of the verb phrase. For example,
     * Where is soccer played?
     *
     * @param array $tagged_question part-of-speech tagged question
     * @param int $index current index in statement
     * @return array parsed triplet suitable for query look-up
     */
    public static function parseWHPlusQuestion($tagged_question, $index)
    {
        // collect the run of verbs that directly follows the wh-word
        $aux_verb = "";
        while (isset($tagged_question[$index])) {
            $tag = trim($tagged_question[$index]['tag']);
            if (!in_array($tag, self::$verb_type)) {
                break;
            }
            $aux_verb .= " " . trim($tagged_question[$index]['token']);
            $index++;
        }
        // noun phrase first, then the verb phrase from where it stopped
        $noun_tree = self::parseNounPhrase($tagged_question,
            ["cur_node" => $index, "NP" => "WHPlus"]);
        $subject = self::extractSubjectParseTree($noun_tree);
        $verb_tree = self::parseVerbPhrase($tagged_question, $noun_tree);
        $predicate = self::extractPredicateParseTree($verb_tree);
        if (!empty($aux_verb)) {
            // the auxiliary verbs are put in front of the RAW predicate only
            $raw_predicate = isset($predicate['RAW']) ?
                $predicate['RAW'] : "";
            $predicate['RAW'] = trim($aux_verb) . " " . $raw_predicate;
        }
        $generated_questions = [];
        foreach (['CONCISE', 'RAW'] as $type) {
            if (empty($subject[$type]) || empty($predicate[$type])) {
                continue;
            }
            $subject_text = trim($subject[$type]);
            $predicate_text = trim($predicate[$type]);
            // question token is tried in both the last and first position
            $generated_questions[$type][] = $subject_text . " " .
                $predicate_text . " " . self::$question_token;
            $generated_questions[$type][] = self::$question_token . " " .
                $predicate_text . " " . $subject_text;
        }
        return $generated_questions;
    }
    /**
     * Takes any question started with WH question and returns the
     * triplet from the question
     *
     * @param string $question question to parse
     * @return array question triplet
     */
    public static function questionParser($question)
    {
        /* Collapse "how to" into the single token "howto" before tagging.
           Word boundaries keep phrases such as "show to" or "how tomorrow"
           untouched, and the match is case-insensitive because the
           question word is compared case-insensitively below
         */
        $question = preg_replace('/\bhow\s+to\b/i', "howto", $question);
        $tagged_question = self::tagTokenizePartOfSpeech($question);
        if (!isset($tagged_question[0])) {
            return [];
        }
        // only questions whose first term is tagged as a wh-word are parsed
        if (!in_array(trim($tagged_question[0]['tag']), ["WRB", "WP"])) {
            return [];
        }
        $token = strtoupper(trim($tagged_question[0]['token']));
        // parsing resumes at index 1, the term after the question word
        if ($token == "WHO") {
            return self::parseWhoQuestion($tagged_question, 1);
        }
        if (in_array($token, ["WHERE", "WHEN", "WHAT", "HOW", "HOWTO"])) {
            return self::parseWHPlusQuestion($tagged_question, 1);
        }
        return [];
    }
    /**
     * Takes a triplets array with subject, predicate, object fields with
     * CONCISE, RAW subfields and, for the given $type (one of CONCISE and
     * RAW), produces an array with QUESTION_LIST and QUESTION_ANSWER_LIST
     * subfields built from the subject, predicate, and object of that type
     *
     * @param array $sub_pred_obj_triplets  in format described above
     * @param string $type either CONCISE or RAW
     * @return array $triplets in format described above
     */
    public static function extractTripletByType($sub_pred_obj_triplets, $type)
    {
        $parts = ['subject', 'predicate', 'object'];
        // all three parts must be present for the requested type
        foreach ($parts as $part) {
            if (empty($sub_pred_obj_triplets[$part][$type])) {
                return [];
            }
        }
        $sentence = [];
        foreach ($parts as $part) {
            $sentence[] = trim($sub_pred_obj_triplets[$part][$type]);
        }
        /* for "is" and "was" the subject and object are also tried in
           swapped order */
        $orderings = [$sentence];
        if (in_array($sentence[1], ['is', 'was'])) {
            $orderings[] = [$sentence[2], $sentence[1], $sentence[0]];
        }
        $question_triplets = [];
        $question_answer_triplets = [];
        foreach ($orderings as $ordering) {
            // blank out each position in turn to make a question
            foreach ($ordering as $slot => $answer) {
                $with_gap = $ordering;
                $with_gap[$slot] = self::$question_token;
                $stemmed_question = self::stemPhrase(
                    implode(" ", $with_gap));
                $question_triplets[] = $stemmed_question;
                $question_answer_triplets[$stemmed_question] =
                    preg_replace('/\s+/u', ' ', $answer);
            }
        }
        return [
            'QUESTION_LIST' => $question_triplets,
            'QUESTION_ANSWER_LIST' => $question_answer_triplets,
        ];
    }
    /**
     * Given an English phrase produces a phrase where each of the terms has
     * been stemmed
     *
     * @param string $phrase phrase to stem
     * @return string in which each term has been stemmed according to the
     *      English stemmer
     */
    private static function stemPhrase($phrase)
    {
        $terms = mb_split("[[:space:]]", $phrase);
        if (empty($terms)) {
            return "";
        }
        $stems = [];
        foreach ($terms as $term) {
            // consecutive whitespace produces blank terms; skip them
            if (trim($term) != "") {
                $stems[] = self::stem($term);
            }
        }
        return implode(" ", $stems);
    }
    // private methods for stemming
    /**
     * Checks to see if the ith character in the buffer is a consonant
     *
     * @param int $i the character to check
     * @return bool whether the ith character is a consonant
     */
    private static function cons($i)
    {
        $letter = self::$buffer[$i];
        if ($letter == 'y') {
            /* y is a consonant at the start of the buffer, otherwise only
               when the character before it is not a consonant */
            return ($i == 0) || !self::cons($i - 1);
        }
        return !in_array($letter, ['a', 'e', 'i', 'o', 'u']);
    }
    /**
     * m() measures the number of consonant sequences between 0 and j. if c is
     * a consonant sequence and v a vowel sequence, and [.] indicates arbitrary
     * presence,
     * <pre>
     *   [c][v]       gives 0
     *   [c]vc[v]     gives 1
     *   [c]vcvc[v]   gives 2
     *   [c]vcvcvc[v] gives 3
     *   ....
     * </pre>
     */
    private static function m()
    {
        $last = self::$j;
        $count = 0;
        $pos = 0;
        // skip the optional leading consonant sequence
        while ($pos <= $last && self::cons($pos)) {
            $pos++;
        }
        if ($pos > $last) {
            return $count;
        }
        $pos++;
        while (true) {
            // run through a vowel sequence
            while ($pos <= $last && !self::cons($pos)) {
                $pos++;
            }
            if ($pos > $last) {
                return $count;
            }
            // a consonant after vowels completes one more vc pair
            $pos++;
            $count++;
            // run through the rest of the consonant sequence
            while ($pos <= $last && self::cons($pos)) {
                $pos++;
            }
            if ($pos > $last) {
                return $count;
            }
            $pos++;
        }
    }
    /**
     * Checks if 0,...$j contains a vowel
     *
     * @return bool whether it does
     */
    private static function vowelinstem()
    {
        $pos = 0;
        while ($pos <= self::$j) {
            if (!self::cons($pos)) {
                // found a non-consonant in 0,...,$j
                return true;
            }
            $pos++;
        }
        return false;
    }
    /**
     * Checks if $j,($j-1) contain a double consonant.
     *
     * @param int $j position to check in buffer for double consonant
     * @return bool if it does or not
     */
    private static function doublec($j)
    {
        // needs two positions holding the same character, a consonant
        return $j >= 1 &&
            self::$buffer[$j] == self::$buffer[$j - 1] &&
            self::cons($j);
    }
    /**
     * Checks whether the letters at the indices $i-2, $i-1, $i in the buffer
     * have the form consonant - vowel - consonant and also if the second c is
     * not w,x or y. this is used when trying to restore an e at the end of a
     * short word. e.g.
     *<pre>
     *   cav(e), lov(e), hop(e), crim(e), but
     *   snow, box, tray.
     *</pre>
     * @param int $i position to check in buffer for consonant-vowel-consonant
     * @return bool whether the letters at indices have the given form
     */
    private static function cvc($i)
    {
        if ($i < 2) {
            return false;
        }
        $has_cvc_shape = self::cons($i) && !self::cons($i - 1) &&
            self::cons($i - 2);
        if (!$has_cvc_shape) {
            return false;
        }
        // the final consonant may not be w, x or y
        return !in_array(self::$buffer[$i], ['w', 'x', 'y']);
    }
    /**
     * Checks if the buffer currently ends with the string $s
     *
     * @param string $s string to use for check
     * @return bool whether buffer currently ends with $s
     */
    private static function ends($s)
    {
        $suffix_length = strlen($s);
        // where $s would have to start for it to stop exactly at $k
        $start = self::$k - $suffix_length + 1;
        $matches = $start >= 0 && substr_compare(self::$buffer, $s, $start,
            $suffix_length) == 0;
        if ($matches) {
            // $j is left pointing at the last character before the suffix
            self::$j = self::$k - $suffix_length;
        }
        return $matches;
    }
    /**
     * setto($s) sets (j+1),...k to the characters in the string $s,
     * readjusting k.
     *
     * @param string $s string to modify the end of buffer with
     */
    private static function setto($s)
    {
        $replacement_length = strlen($s);
        // overwrite as many characters after $j as $s is long
        self::$buffer = substr_replace(self::$buffer, $s, self::$j + 1,
            $replacement_length);
        self::$k = self::$j + $replacement_length;
    }
    /**
     * Sets the ending in the buffer to $s if the number of consonant sequences
     * between 0 and $j is positive.
     *
     * @param string $s what to change the suffix to
     */
    private static function r($s)
    {
        if (self::m() <= 0) {
            // measure is not positive, so the ending is left alone
            return;
        }
        self::setto($s);
    }

    /** step1ab() gets rid of plurals and -ed or -ing. e.g.
     * <pre>
     *    caresses  ->  caress
     *    ponies    ->  poni
     *    ties      ->  ti
     *    caress    ->  caress
     *    cats      ->  cat
     *
     *    feed      ->  feed
     *    agreed    ->  agree
     *    disabled  ->  disable
     *
     *    matting   ->  mat
     *    mating    ->  mate
     *    meeting   ->  meet
     *    milling   ->  mill
     *    messing   ->  mess
     *
     *    meetings  ->  meet
     * </pre>
     */
    private static function step1ab()
    {
        // plural endings: -sses, -ies and a single trailing -s
        if (self::$buffer[self::$k] == 's') {
            if (self::ends("sses")) {
                self::$k = self::$k - 2;
            } else if (self::ends("ies")) {
                self::setto("i");
            } else {
                $before_last = self::$buffer[self::$k - 1];
                if ($before_last != 's') {
                    self::$k--;
                }
            }
        }
        // -eed only loses its d when the stem in front has positive measure
        if (self::ends("eed")) {
            if (self::m() > 0) {
                self::$k--;
            }
            return;
        }
        $suffix_found = self::ends("ed") || self::ends("ing");
        if (!$suffix_found || !self::vowelinstem()) {
            return;
        }
        // drop the -ed/-ing and then tidy up what is left
        self::$k = self::$j;
        if (self::ends("at")) {
            self::setto("ate");
            return;
        }
        if (self::ends("bl")) {
            self::setto("ble");
            return;
        }
        if (self::ends("iz")) {
            self::setto("ize");
            return;
        }
        if (self::doublec(self::$k)) {
            // undouble the final consonant unless it is l, s or z
            self::$k--;
            if (in_array(self::$buffer[self::$k], ['l', 's', 'z'])) {
                self::$k++;
            }
            return;
        }
        if (self::m() == 1 && self::cvc(self::$k)) {
            // short stem of consonant-vowel-consonant form gets an e back
            self::setto("e");
        }
    }
    /**
     * step1c() turns terminal y to i when there is another vowel in the stem.
     */
    private static function step1c()
    {
        if (!self::ends("y")) {
            return;
        }
        if (self::vowelinstem()) {
            // the y sits at position $k, the end of the current word
            self::$buffer[self::$k] = 'i';
        }
    }
    /**
     * step2() maps double suffixes to single ones. so -ization ( = -ize plus
     * -ation) maps to -ize etc. Note that the string before the suffix must
     * give m() > 0.
     */
    private static function step2()
    {
        if (self::$k < 1) {
            return;
        }
        /* suffix => replacement, grouped by the next to last letter of the
           word; within a group the suffixes are tried in the order listed
           and only the first one that matches is acted on */
        $rules = [
            'a' => ["ational" => "ate", "tional" => "tion"],
            'c' => ["enci" => "ence", "anci" => "ance"],
            'e' => ["izer" => "ize"],
            'l' => ["abli" => "able", "alli" => "al", "entli" => "ent",
                "eli" => "e", "ousli" => "ous"],
            'o' => ["ization" => "ize", "ation" => "ate", "ator" => "ate"],
            's' => ["alism" => "al", "iveness" => "ive", "fulness" => "ful",
                "ousness" => "ous"],
            't' => ["aliti" => "al", "iviti" => "ive", "biliti" => "ble"],
        ];
        $penultimate = self::$buffer[self::$k - 1];
        if (!isset($rules[$penultimate])) {
            return;
        }
        foreach ($rules[$penultimate] as $suffix => $replacement) {
            if (self::ends($suffix)) {
                self::r($replacement);
                return;
            }
        }
    }
    /**
     * step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
     */
    private static function step3()
    {
        /* suffix => replacement, grouped by the last letter of the word;
           within a group the suffixes are tried in the order listed and
           only the first one that matches is acted on */
        $rules = [
            'e' => ["icate" => "ic", "ative" => "", "alize" => "al"],
            'i' => ["iciti" => "ic"],
            'l' => ["ical" => "ic", "ful" => ""],
            's' => ["ness" => ""],
        ];
        $last_letter = self::$buffer[self::$k];
        if (!isset($rules[$last_letter])) {
            return;
        }
        foreach ($rules[$last_letter] as $suffix => $replacement) {
            if (self::ends($suffix)) {
                self::r($replacement);
                return;
            }
        }
    }
    /**
     * step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
     */
    private static function step4()
    {
        // guard so that the k - 1 index used below is never negative
        if (self::$k < 1) {
            return;
        }
        /* Suffixes to test with ends(), keyed by the second to last letter
           of the current word. The tests within a list are made in the
           order given (so -ement is tried before -ment and -ent). The
           letter 'o' is absent because it needs an extra condition and is
           handled separately below. */
        $suffixes_by_penultimate = [
            'a' => ["al"],
            'c' => ["ance", "ence"],
            'e' => ["er"],
            'i' => ["ic"],
            'l' => ["able", "ible"],
            'n' => ["ant", "ement", "ment", "ent"],
            's' => ["ism"],
            't' => ["ate", "iti"],
            'u' => ["ous"],
            'v' => ["ive"],
            'z' => ["ize"]
        ];
        $penultimate = self::$buffer[self::$k - 1];
        $suffix_found = false;
        if ($penultimate == 'o') {
            /* -ion only counts when the character at position j (read
               after the ends() call succeeds) is an s or a t */
            if (self::ends("ion") && self::$j >= 0) {
                $before_suffix = self::$buffer[self::$j];
                $suffix_found = ($before_suffix == 's' ||
                    $before_suffix == 't');
            }
            // the -ou test is what takes care of -ous
            if (!$suffix_found) {
                $suffix_found = self::ends("ou");
            }
        } else if (isset($suffixes_by_penultimate[$penultimate])) {
            foreach ($suffixes_by_penultimate[$penultimate] as $suffix) {
                if (self::ends($suffix)) {
                    $suffix_found = true;
                    break;
                }
            }
        }
        /* a recognized suffix is only dropped when m() exceeds 1; dropping
           it means moving the end marker k back to j */
        if ($suffix_found && self::m() > 1) {
            self::$k = self::$j;
        }
    }
    /** step5() removes a final -e if m() > 1, and changes -ll to -l if
     * m() > 1.
     */
    private static function step5()
    {
        // m() is evaluated below with j placed at the current end marker k
        self::$j = self::$k;
        if (self::$buffer[self::$k] == 'e') {
            $measure = self::m();
            /* drop the final e when the measure is above 1, or when it is
               exactly 1 and cvc() does not hold at position k - 1 */
            $drop_final_e = ($measure > 1);
            if (!$drop_final_e && $measure == 1) {
                $drop_final_e = !self::cvc(self::$k - 1);
            }
            if ($drop_final_e) {
                self::$k--;
            }
        }
        // what follows only concerns a word that (now) ends in l
        if (self::$buffer[self::$k] != 'l') {
            return;
        }
        // -ll becomes -l when doublec() holds at k and m() is above 1
        if (self::doublec(self::$k) && self::m() > 1) {
            self::$k--;
        }
    }
    //private methods for part of speech tagging
    /**
     * Takes an array of pairs (token, tag) that came from phrase
     * and builds a new phrase where terms look like token~tag.
     *
     * @param array $tagged_tokens array pairs as might come from tagTokenize
     * @param bool $with_tokens whether to include the terms and the tags
     *      in the output string or just the part of speech tags
     * @return string $tagged_phrase a phrase with terms in the format token~tag
     *      ($with_token == true) or space separated tags (!$with_token).
     */
    private static function taggedPartOfSpeechTokensToString($tagged_tokens,
        $with_tokens = true)
    {
        /* Collapse fine-grained tags into four coarse classes: NN, VB,
           AJ and AV. Tags that are not listed here are output unchanged. */
        $coarse_tag_for = [
            // tags reduced to NN
            "NN" => "NN", "NNS" => "NN", "NNP" => "NN", "NNPS" => "NN",
            "PRP" => "NN", 'PRP$' => "NN", "WP" => "NN",
            // tags reduced to VB
            "VB" => "VB", "VBD" => "VB", "VBN" => "VB", "VBP" => "VB",
            "VBZ" => "VB",
            // tags reduced to AJ
            "JJ" => "AJ", "JJR" => "AJ", "JJS" => "AJ",
            // tags reduced to AV
            "RB" => "AV", "RBR" => "AV", "RBS" => "AV", "WRB" => "AV"
        ];
        $pieces = [];
        foreach ($tagged_tokens as $pair) {
            $output_tag = trim($pair['tag']);
            if (isset($coarse_tag_for[$output_tag])) {
                $output_tag = $coarse_tag_for[$output_tag];
            }
            $prefix = "";
            if ($with_tokens) {
                $prefix = $pair['token'] . "~";
            }
            // every entry, including the last one, is followed by a space
            $pieces[] = $prefix . $output_tag .  " ";
        }
        return implode("", $pieces);
    }
}
ViewGit