Last commit for src/library/PhraseParser.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\models\LocaleModel;
use seekquarry\yioop\library\processors\PageProcessor;

/**
 * For crawlHash
 */
require_once __DIR__ . "/Utility.php";
/**
 * So know which part of speech tagger to use
 */
require_once __DIR__ . "/LocaleFunctions.php";
/**
 * Library of functions used to manipulate words and phrases
 *
 * @author Chris Pollett
 */
class PhraseParser
{
    /**
     * A list of meta words that might be extracted from a query
     * @var array
     */
    public static $meta_words_list = ['\-i:', '\-index:',  '\-', 'class:',
        'class-score:', 'cld:', 'code:', 'color:', 'date:', 'dns:', 'duration:',
        'filetype:', 'guid:', 'hash:', 'host:', 'i:', 'info:', 'index:', 'ip:',
        'link:',  'lang:', 'layout:', 'location:', 'media:', 'modified:',
        'numlinks:', 'os:', 'path:', 'pubdate:', 'robot:', 'safe:', 'server:',
        'site:', 'size:', 'time:', 'u:', 'version:','weight:', 'w:'
        ];
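    /**
     * Maps a language tag that can be passed as $lang to the name of the
     * programming language it stands for. That name is used to build both
     * the name of the tokenizer method for the language and its control
     * word
     * @var array
     */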
    public static $programming_language_map = ['java' => 'java',
            'py' => 'python'];
    /**
     *  Tokenizer objects that have been loaded so far
     *  @var array
     */
    public static $tokenizers = [];
    /**
     * Suffix appended to a programming language name to get the name of
     * the method of this class used to tokenize that language
     */
    const TOKENIZER = 'Tokenizer';
    /**
     * Indicates the control word for programming languages
     */
    const CONTROL_WORD_INDICATOR = ':';
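    /**
     * Initial value of the match flag used to enter the token matching
     * loop of the programming language tokenizers (1 is what preg_match
     * returns when it finds a match)
     */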
    const REGEX_INITIAL_POSITION = 1;
    /**
     * Threshold to use for a string to be considered "safe" (not X-rated)
     */
    const SAFE_PHRASE_THRESHOLD = 0.035;
    /**
     * Builds a single string out of the title and description of a page
     * summary. Punctuation (as given by C\PUNCT) in each field is replaced
     * by a space, the two fields are joined by a space, and runs of white
     * space are then collapsed to a single space.
     *
     * @param array $page associative array of page summary data. Only its
     *     CrawlConstants::TITLE and CrawlConstants::DESCRIPTION fields are
     *     read; a missing field is treated as the empty string
     * @return string the cleaned title followed by the cleaned description
     */
    public static function extractWordStringPageSummary($page)
    {
        $cleaned_fields = [];
        $field_names = [CrawlConstants::TITLE, CrawlConstants::DESCRIPTION];
        foreach ($field_names as $field_name) {
            $cleaned_fields[] = isset($page[$field_name]) ?
                mb_ereg_replace(C\PUNCT, " ", $page[$field_name]) : "";
        }
        return preg_replace("/(\s)+/", " ", implode(" ", $cleaned_fields));
    }
    /**
     * Extracts the terms of $string that should be used when looking the
     * string up in an index. Unless $lang is a key of
     * self::$programming_language_map, punctuated terms in the string are
     * canonicalized and entities are hyphenated first; for a programming
     * language the leading control word is cut off instead. The string is
     * then split into terms using stemCharGramSegment.
     *
     * @param string $string subject to extract phrases from
     * @param string $lang locale tag for stemming
     * @param string $index_name name of index to be used as a reference
     *     when extracting phrases. A leading '-' is removed
     * @param bool $exact_match whether the match has to be exact or not
     * @param int $threshold not used by the current implementation; kept
     *     so that existing callers continue to work
     * @return array the terms of $string. They are returned as computed if
     *     no index name was given, there is at most one term, or the
     *     locale's Tokenizer class declares $char_gram_len. Otherwise, at
     *     most C\MAX_QUERY_TERMS terms are returned, except that a one
     *     element array holding the whole phrase is returned when the
     *     phrase contains the question token of the locale's tokenizer
     */
    public static function extractPhrases($string, $lang = null,
        $index_name = null, $exact_match = false, $threshold =
        C\MIN_RESULTS_TO_GROUP)
    {
        if (!empty($index_name) && !is_integer($index_name) &&
            $index_name[0] == '-') {
            $index_name = substr($index_name, 1);
        }
        $char_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer";
        if (isset(self::$programming_language_map[$lang])) {
            $control_word = self::$programming_language_map[$lang] .
                self::CONTROL_WORD_INDICATOR;
            // drop the control word and the one character following it
            $string = trim(substr($string, strlen($control_word) + 1));
        } else {
            self::canonicalizePunctuatedTerms($string, $lang);
            self::hyphenateEntities($string, $lang);
        }
        $terms = self::stemCharGramSegment($string, $lang);
        $num_terms = count($terms);
        if ($index_name == null || $num_terms <= 1 ||
            (class_exists($char_class) && isset($char_class::$char_gram_len))) {
            return $terms;
        }
        // keep only first C\MAX_QUERY_TERMS many terms
        if ($num_terms > C\MAX_QUERY_TERMS) {
            $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS);
        }
        if ($exact_match || ($index_name != 'feed' &&
            IndexManager::getVersion($index_name) == 0)) {
            /* for exact phrase search do not use suffix tree stuff for now.
               Also, for old style index before max phrase extraction
               just return terms
            */
            return $terms;
        }
        $tokenizer = self::getTokenizer($lang);
        /* other users of getTokenizer in this class check its result for
           emptiness before using it, so do the same here rather than
           failing on a static property access of an empty value
         */
        if (empty($tokenizer) || empty($tokenizer::$question_token)) {
            return $terms;
        }
        // query terms are question answer triplet then do no further processing
        $whole_phrase = implode(" ", $terms);
        if (stristr($whole_phrase, $tokenizer::$question_token) !== false) {
            return [$whole_phrase];
        }
        return $terms;
    }
    /**
     * Computes how often each word or phrase found by extractPhrasesInLists
     * occurs in $string. The count of an entry is the number of positions
     * recorded for it in the WORD_LIST component of that method's result.
     *
     * @param string $string subject to extract phrases from
     * @param string $lang locale tag for stemming
     * @return array associative array phrase => number of occurrences
     */
    public static function extractPhrasesAndCount($string, $lang = null)
    {
        $position_lists =
            self::extractPhrasesInLists($string, $lang)['WORD_LIST'];
        // with a single input array, array_map keeps the phrase keys
        return array_map('count', $position_lists);
    }
    /**
     * Computes, for a string, the positions at which each of its terms
     * occurs and, when C\ENABLE_QUESTION_ANSWERING is on and the tokenizer
     * for $lang supports it, question answer data extracted from the
     * sentences of the string. Timing information for each stage is
     * collected as well.
     *
     * @param string $string subject to extract phrases from
     * @param string $lang locale tag for stemming and other phrase processing
     *      related stuff
     * @return array with fields: TIMES, an array with the time spent in the
     *     CANONICALIZE, TERM_POSITIONS_SENTENCE_TAGGING and
     *     QUESTION_ANSWER_EXTRACT stages together with the TOTAL_TIME;
     *     WORD_LIST, an array word => list of positions at which the word
     *     occurred; and, only if questions were extracted,
     *     QUESTION_ANSWER_LIST as returned by the tokenizer
     */
    public static function extractPhrasesInLists($string, $lang = null)
    {
        $begin_time = microtime(true);
        $lists = ['TIMES' => [ 'CANONICALIZE' => 0,
            'TERM_POSITIONS_SENTENCE_TAGGING' => 0,
            'QUESTION_ANSWER_EXTRACT' => 0,
            'TOTAL_TIME' => 0]];
        $is_programming_language =
            isset(self::$programming_language_map[$lang]);
        if (!$is_programming_language) {
            // natural language text is normalized before being split
            self::canonicalizePunctuatedTerms($string, $lang);
            self::hyphenateEntities($string, $lang);
            $lists['TIMES']['CANONICALIZE'] = changeInMicrotime($begin_time);
        }
        $tagging_begin_time = microtime(true);
        $extracted = self::extractTermSentencePositionsTags(
            $string, $lang, C\ENABLE_QUESTION_ANSWERING);
        $term_positions = empty($extracted["TERM_POSITIONS"]) ? [] :
            $extracted["TERM_POSITIONS"];
        $lists['TIMES']['TERM_POSITIONS_SENTENCE_TAGGING'] =
            changeInMicrotime($tagging_begin_time);
        if (C\ENABLE_QUESTION_ANSWERING && !empty($extracted["SENTENCES"])) {
            $question_begin_time = microtime(true);
            $tokenizer = self::getTokenizer($lang);
            $supports_triplets = !empty($tokenizer) &&
                method_exists($tokenizer, "tagTokenizePartOfSpeech") &&
                method_exists($tokenizer, "extractTripletsPhrases");
            if ($supports_triplets) {
                $triplets = $tokenizer->extractTripletsPhrases(
                    $extracted["SENTENCES"], $lang);
                if (!empty($triplets['QUESTION_LIST'])) {
                    // array union: positions already found take precedence
                    $term_positions += $triplets['QUESTION_LIST'];
                    $lists['QUESTION_ANSWER_LIST'] =
                        $triplets['QUESTION_ANSWER_LIST'];
                }
                $lists['TIMES']['QUESTION_ANSWER_EXTRACT'] =
                    changeInMicrotime($question_begin_time);
            }
        }
        $lists['WORD_LIST'] = $term_positions;
        $lists['TIMES']['TOTAL_TIME'] = changeInMicrotime($begin_time);
        return $lists;
    }
    /**
     * Canonicalizes punctuated terms and hyphenates entities in $string,
     * splits the result into terms, and records for each term the list of
     * positions at which it occurs. Positions are counted in terms and
     * start at 1.
     *
     * @param string $string text to extract terms and their positions from
     * @param string $lang locale of text
     * @return array associative array term => list of positions; empty if
     *     no terms could be extracted
     */
    public static function extractTermPositions($string, $lang)
    {
        self::canonicalizePunctuatedTerms($string, $lang);
        self::hyphenateEntities($string, $lang);
        $terms = self::stemCharGramSegment($string, $lang);
        if (empty($terms)) {
            return [];
        }
        $positions = [];
        // only look inside terms for hyphens if the text has some hyphen
        $split_entities = (strpos($string ?? "", "-") !== false);
        /* first position in doc is 1 as will encode with modified9
           which requires positive numbers
         */
        $position = 1;
        foreach ($terms as $term) {
            $positions[$term][] = $position;
            if ($split_entities) {
                /* this is to allow for searching by entities and parts
                   of entities: every hyphen separated part after the
                   first gets the position of the whole term
                 */
                $later_parts = array_slice(explode("-", $term), 1);
                foreach ($later_parts as $part) {
                    $positions[$part][] = $position;
                }
            }
            $position++;
        }
        return $positions;
    }
    /**
     * This method tries to convert acronyms, e-mail, urls, etc into
     * a format that does not involve punctuation that will be stripped
     * as we extract phrases. The steps carried out, in order, are:
     * the entities &apos; and &#039; become an apostrophe; acronyms written
     * with periods lose their periods and get a leading underscore (so
     * U.S.A. becomes _USA); an ampersand or an 'n between words becomes
     * _and_ (so AT&T becomes AT_and_T); punctuation in urls and e-mail
     * addresses is replaced by codes such as _d_ for a period and _at_
     * for an at sign; finally, if the tokenizer for $lang has its own
     * canonicalizePunctuatedTerms method, that method is applied.
     *
     * @param string &$string a string of words, etc which might involve such
     *      terms. It is modified in place
     * @param string $lang a language tag used to look up the tokenizer
     *      whose canonicalizePunctuatedTerms method, if any, is applied last
     */
    public static function canonicalizePunctuatedTerms(&$string, $lang = null)
    {
        $string = (preg_replace("/\s*\&(?:apos|#039)\;\s*/u", "'",
            ($string ?? ""))) ?? "";
        $acronym_pattern = "/\b\p{L}(\.\s*\p{L})+(\.|\b)/u";
        $string = preg_replace_callback($acronym_pattern,
            function($matches) {
                return "_" . preg_replace("/\.\s*/u", "", $matches[0]);
            }, $string) ?? "";
        /* characters accepted as an apostrophe: ASCII apostrophe, right
           single quotation mark (U+2019) and modifier letter apostrophe
           (U+02BC).
           NOTE(review): U+2020 is the dagger character and looks like a
           typo for U+2019; it is still accepted so that text matched
           before continues to be matched
         */
        $ap = "(\'|\u{2019}|\u{2020}|\u{02BC})";
        $ampersand_pattern = "/\p{L}+".
            "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*\p{L})+/u";
        $string = preg_replace_callback($ampersand_pattern,
            function($matches) use ($ap) {
                return preg_replace(
                    "/\s*(" . $ap . "n\b|" . $ap . "N\b|\&)\s*/u",
                    "_and_", $matches[0]);
            }, $string) ?? "";
        // an &amp; entity was turned into _and_amp; by the previous step
        $string = preg_replace("/\s*_and_amp;\s*/", "_and_", $string) ?? "";
        $url_or_email_pattern =
            '@((gopher|http|https)://([^ \t\r\n\v\f\'\"\;\,<>])*)|'.
            '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i';
        $string = preg_replace_callback($url_or_email_pattern,
            function($matches) {
                return preg_replace(['/\./', "/\:/", "/\//", "/\@/",
                    "/\[/", "/\]/", "/\(/", "/\)/", "/\?/", "/\=/", "/\&/"],
                    ["_d_", "_c_", "_s_", "_at_", "_bo_", "_bc_", "_po_",
                    "_pc_", "_q_", "_e_", "_and_"], $matches[0]);
            },
            $string) ?? "";
        $tokenizer = self::getTokenizer($lang);
        if (!empty($tokenizer) &&
            method_exists($tokenizer, "canonicalizePunctuatedTerms")) {
            $tokenizer->canonicalizePunctuatedTerms($string);
        }
    }
    /**
     * Given a string, hyphenates words in the string which appear in
     * a bloom filter for the given locale as phrases. The string is split
     * on white space and scanned left to right, growing a candidate
     * sequence of adjacent words for as long as
     * NWordGrams::ngramsContains reports the candidate followed by "*" as
     * present (presumably a prefix lookup -- verify against NWordGrams).
     * The longest candidate that was itself reported as present has its
     * spaces replaced by hyphens. On return, words are separated by single
     * spaces. If $lang is empty, the string is left untouched.
     *
     * @param string &$string a string of words, etc which might involve such
     *      terms. It is modified in place
     * @param string $lang a language tag to use as part of the
     *      canonicalization process
     */
    public static function hyphenateEntities(&$string, $lang = null)
    {
        if (!$lang) {
            return;
        }
        /* PREG_SPLIT_NO_EMPTY removes only empty pieces. array_filter()
           without a callback must not be used for this as it would also
           remove the word "0" from the text
         */
        $parts = preg_split("/\s+/u", $string ?? "", -1,
            PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            // could not split (for example, invalid UTF-8): leave as is
            return;
        }
        $num_parts = count($parts);
        $current_entity = "";
        $lower_entity = "";
        $last_entity = "";
        $lower_last_entity = "";
        $contains = false;
        $out_string = "";
        $space = "";
        /* $i is the index of the first word of the current candidate,
           $j is one more than the index of its last word, and $k is
           where the next candidate starts once the current one is output
         */
        $i = 0;
        $j = -1;
        $k = 0;
        while ($j < $num_parts) {
            $j++;
            $current_entity = trim(implode(" ",
                array_slice($parts, $i, $j - $i)));
            $lower_entity = mb_strtolower($current_entity);
            if ($j - $i > 1) {
                $contains = false;
                if (NWordGrams::ngramsContains(
                    $lower_entity, $lang, "all")) {
                    $last_entity = $current_entity;
                    $lower_last_entity = $lower_entity;
                    $k = $j;
                    $contains = true;
                }
                if (!NWordGrams::ngramsContains(
                    $lower_entity . "*", $lang, "all")) {
                    // extra checks as Bloom filter not 100%
                    if (strpos(substr($last_entity, 4), " ") > 0 &&
                        !preg_match('/\(|\)|\[|\]|,/', $last_entity) &&
                        NWordGrams::ngramsContains($lower_last_entity, $lang,
                        "all")) {
                        $last_entity = str_replace(" ", "-", $last_entity);
                    }
                    $out_string .= $space . $last_entity;
                    $space = " ";
                    $current_entity = "";
                    $last_entity = "";
                    $lower_last_entity = "";
                    // restart the scan just after what was output
                    $i = $k;
                    $j = $k - 1;
                }
            } else {
                $contains = false;
                $last_entity = $current_entity;
                $lower_last_entity = $lower_entity;
                $k = $j;
            }
        }
        // handle the candidate still pending when the words ran out
        if ($contains && strpos(trim($current_entity), " ") > 0 &&
            !preg_match('/\-|\(|\)|\[|\]|,|\./', $current_entity)) {
            $current_entity = str_replace(" ", "-", $current_entity);
        }
        $string = $out_string . " " . $current_entity;
    }
    /**
     * Splits string according to punctuation and white space then
     * extracts (stems/char grams) of terms and makes a position list for
     * them. If requested, and the tokenizer for $lang has a
     * tagTokenizePartOfSpeech method and $lang is not a programming
     * language, the lower cased string is also split into sentences and
     * a position list is made for the sentences.
     *
     * @param string $string to extract terms from
     * @param string $lang IANA tag to look up stemmer under
     * @param boolean $extract_sentences whether to extract sentences to
     *  be used by question answering system
     * @return array empty if no terms were found. Otherwise, an array with
     *     field TERM_POSITIONS, an array term => list of positions (counted
     *     in terms, starting at 1) and, if sentences were extracted, field
     *     SENTENCES, an array sentence => list of offsets as reported by
     *     preg_split within the lower cased string prefixed by a dummy
     *     sentence
     */
    public static function extractTermSentencePositionsTags($string,
        $lang = null, $extract_sentences = false)
    {
        $terms = self::stemCharGramSegment($string, $lang);
        if (empty($terms)) {
            return [];
        }
        $pos_lists = [];
        /* first position in doc is 1 as will encode with modified9
           which requires positive numbers
         */
        $t = 1;
        $split_entities = (strpos($string ?? "", "-") !== false);
        foreach ($terms as $term) {
            $pos_lists[$term][] = $t;
            if ($split_entities) {
                /* this is to allow for searching by entities and parts
                   of entities
                 */
                $term_parts = explode("-", $term ?? "");
                array_shift($term_parts);
                foreach ($term_parts as $part) {
                    $pos_lists[$part][] = $t;
                }
            }
            $t++;
        }
        $out = ["TERM_POSITIONS" => $pos_lists];
        $tokenizer = self::getTokenizer($lang);
        if ($extract_sentences && !empty($tokenizer) &&
            method_exists($tokenizer, "tagTokenizePartOfSpeech") &&
            !isset(self::$programming_language_map[$lang])) {
            // a dummy first sentence is added here and removed again below
            $string = 'Zdummy. ' . mb_strtolower($string);
            /* sentences end at a blank line, a period followed by a non
               digit or the end of the text, or at one of ! ? and their
               CJK counterparts: ideographic full stop (U+3002), fullwidth
               exclamation mark (U+FF01), fullwidth question mark (U+FF1F).
               The latter are written as escapes so the pattern does not
               depend on the encoding of this file
             */
            $sentence_end_pattern = '/\s*((\n\n+)|\.[\D]|\.$|\!|\?|' .
                '\x{3002}|\x{FF01}|\x{FF1F})\s*/u';
            $sentences = preg_split($sentence_end_pattern, $string, -1,
                PREG_SPLIT_OFFSET_CAPTURE);
            if ($sentences === false) {
                // could not split (for example, invalid UTF-8)
                $sentences = [];
            }
            $sentences_pos = [];
            foreach ($sentences as $sentence_data) {
                list($sentence, $pos) = $sentence_data;
                $sentences_pos[$sentence][] = $pos;
            }
            unset($sentences_pos[""], $sentences_pos["Zdummy"]);
            $out["SENTENCES"] = $sentences_pos;
        }
        return $out;
    }
    /**
     * Breaks a string into terms. If $lang is a key of
     * self::$programming_language_map, the tokenizer method of this class
     * for that programming language is used. Otherwise, the string is lower
     * cased, white space and all punctuation of C\PUNCT except the hyphen
     * are replaced by a space, the result is segmented and char-grammed
     * for the locales this method lists as needing it, and finally stemmed.
     *
     * @param string $string what to extract terms from
     * @param string $lang locale tag to determine which stemmers, chargramming
     *     and segmentation needs to be done.
     * @param bool $to_string if the result should be imploded on space to
     *      a single string or left as an array of terms
     *
     * @return mixed either an array of the terms computed from the string
     *  or a string where this array has been imploded on space
     */
    public static function stemCharGramSegment($string, $lang,
        $to_string = false)
    {
        static $non_hyphens = "";
        static $segment_char_gram_lang = [];
        if (empty($non_hyphens)) {
            // computed on first use, then remembered between calls
            $segment_char_gram_lang = ['zh-CN', "ko", "ja", 'bn', 'he', 'hi',
                'id', 'kn', 'pl', 'te', 'th', 'tl'];
            $non_hyphens = str_replace("-|", "", C\PUNCT);
        }
        mb_internal_encoding("UTF-8");
        if (isset(self::$programming_language_map[$lang])) {
            // method name is, for example, javaTokenizer
            $tokenizer_method = self::$programming_language_map[$lang] .
                self::TOKENIZER;
            $terms = self::$tokenizer_method($string, $lang);
            return $to_string ? implode(" ", $terms) : $terms;
        }
        $lowered = mb_strtolower($string);
        if ($lang == "hi") {
            $lowered = preg_replace('/(,:)\p{P}/u', "", $lowered);
        }
        $terms = mb_ereg_replace("\s+|$non_hyphens", " ", $lowered);
        /* result is not used here; the call is kept in case looking up
           the tokenizer has side effects later steps rely on
         */
        self::getTokenizer($lang);
        if (in_array($lang, $segment_char_gram_lang)) {
            $segmented = self::segmentSegment($terms, $lang);
            $terms = self::charGramTerms($segmented, $lang);
        }
        $terms = self::stemTerms($terms, $lang);
        return $to_string ? implode(" ", $terms) : $terms;
    }
    /**
     * Given a string tokenizes into Java tokens. A regular expression
     * for comments, literals, operators, separators, keywords and
     * identifiers is matched repeatedly against the remaining text. Text
     * before a match that is not part of any token is skipped. A matched
     * token containing white space (such as a comment) is split into its
     * white space separated words. Each resulting term is prefixed with the
     * language name for $lang followed by self::CONTROL_WORD_INDICATOR.
     *
     * @param string $string what to extract terms from
     * @param string $lang indicates programming language; must be a key
     *      of self::$programming_language_map
     *
     * @return array the terms computed from the string
     */
    public static function javaTokenizer($string, $lang)
    {
        //Comments
        $single_line_comments = "(\/\/).*?(\n)";
        $multiline_comments = "\\/\\*[^(\\/\\*)]*?\\*\\/";
        $javadoc_comments = "\/\*([^*]|[\r\n]|(\*+([^*\/]|[\r\n])))*\*+\/";
        $multiple_line_comments = "$javadoc_comments|$multiline_comments";
        $comments = "($multiple_line_comments|$single_line_comments)";
        //Identifiers
        $alphabetic = "[A-Za-z]";
        $id_start = "($alphabetic)|\\".'$'."|\_";
        $numeric = "[0-9]";
        $repeat = "$id_start|$numeric";
        $identifiers = "($id_start)($repeat)*";
        //Keywords
        $keywords_part1 = "abstract|assert|boolean|break|byte|case|catch|char";
        $keywords_part2 =
            "class|const|continue|default|do|double|else|extends";
        $keywords_part3 = "final|finally|float|for|goto|if|implements|import";
        $keywords_part4 = "instanceof|int|interface|long|native|new|package";
        $keywords_part5 = "private|protected|public|return|short|static";
        $keywords_part6 = "strictfp|super|synchronized|switch|this|throw";
        $keywords_part7 = "throws|transient|try|void|volatile|while";
        $keywords_string1 = "$keywords_part1|$keywords_part2|$keywords_part3";
        $keywords_string2 = "$keywords_part4|$keywords_part5|$keywords_part6";
        $keywords_string3 = "$keywords_part7";
        $keywords = "($keywords_string1|$keywords_string2|$keywords_string3)";
        //Separators
        $separators = "(;|,|\.|\(|\)|\{|\}|\[|\])";
        //Operators
        $operators_part1 = "\+|\-|\*|\/|&|\||\^|%|<<|>>|=|>|<|!|~|\?|:";
        $operators_part2 = "\-\-|\+\+|>>>|==|<=|>=|!=|&&|\|\|";
        $operators_part3 = "\+=|\-=|\*=|\/=|&=|\|=|\^=|%=|<<=|>>=|>>>=";
        $operators = "($operators_part3|$operators_part2|$operators_part1)";
        //Null Literal
        $null_literal = "null";
        //Boolean Literal
        $boolean_literal = "true|false";
        //Floating point Literal
        $non_zero_digit = "1|2|3|4|5|6|7|8|9";
        $digit = "0|$non_zero_digit";
        $digits = "($digit)($digit)*";
        $exponent_part = "(e|E)([\+|\-])($digits)";
        $float_part1 = "($digits)($exponent_part)";
        $float_part2 = "($digits)(\.)($digits)?($exponent_part)?";
        $float_part3 = "(\.$digits)($exponent_part)?";
        $floating_point_numeral = "$float_part1|$float_part2|$float_part3";
        //Integer Literal
        $decimal_numeral = "0|($non_zero_digit)($digits){0,1}";
        $hex_numeral = "0[x|X][0-9A-Fa-f]+";
        $octal_numeral = "0[0-7]+";
        $integer_numeral = "($hex_numeral|$octal_numeral|$decimal_numeral)";
        //Character Literal
        $special_part1 = "\!|%|\^|&|\*|\(|\)|\-|\+|\=|\{|\}|\||~|\[|\]|\\|;";
        $special_part2 = "'|\:|\<|\>|\?|,|\.|\/|#|@|`|_";
        $special = "$special_part1|$special_part2";
        $alphanumeric = "[A-Za-z0-9]";
        $graphic = "$alphanumeric|$special";
        $escape = "\\n|\\t|\\v|\\a|\\b|\\r|\\f|\\\\|\\'|\\\"";
        $char_literal = "(\'($graphic)\'|\'\s\'|\'($escape)\')";
        //String Literal
        $string_literal = "(\"($graphic|\s|$escape)*?[^\\\]\")";
        //Literals
        $literals_part1 = "$string_literal|$floating_point_numeral";
        $literals_part2 = "$integer_numeral|$char_literal|$boolean_literal";
        $literals_part3 = "$null_literal";
        $literals = "($literals_part1|$literals_part2|$literals_part3)";
        //Java Tokens
        $tokens_part1 = "$comments|$literals|$operators";
        $tokens_part2 = "$separators|$keywords|$identifiers";
        $tokens = "($tokens_part1|$tokens_part2)";
        $term_prefix = self::$programming_language_map[$lang] .
            self::CONTROL_WORD_INDICATOR;
        $current_length = strlen($string);
        $position = self::REGEX_INITIAL_POSITION;
        $results = [];
        /* stop when nothing matches any more ($position not 1) or when
           the last match was empty
         */
        while ($position == 1 && $current_length > 0) {
            $position = preg_match("/$tokens/", $string, $matches,
                PREG_OFFSET_CAPTURE);
            if (isset($matches[0][0])) {
                list($token, $offset) = $matches[0];
                // turn the token into a list of white space free words
                $text = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/",
                    "\n", $token);
                $lines = explode("\n", trim($text));
                $line = implode(' ', $lines);
                $data = preg_replace("/[\t\s]+/", ' ', trim($line));
                foreach (explode(" ", trim($data)) as $result) {
                    // compared with "" as empty() would discard the term "0"
                    if ($result !== "") {
                        $results[] = $term_prefix . trim($result);
                    }
                }
                $current_length = strlen($token);
                /* the match need not start at the beginning of $string,
                   so continue from where the match ended rather than
                   from the length of the match
                 */
                $string = trim(substr($string, $offset + $current_length));
            }
        }
        return $results;
    }
    /**
     * Given a string tokenizes into Python tokens. A regular expression
     * for comments, literals, delimiters, operators, keywords and
     * identifiers is matched repeatedly against the remaining text. Text
     * before a match that is not part of any token is skipped. A matched
     * token containing white space (such as a comment) is split into its
     * white space separated words. Each resulting term is prefixed with the
     * language name for $lang followed by self::CONTROL_WORD_INDICATOR.
     *
     * @param string $string what to extract terms from
     * @param string $lang indicates programming language; must be a key
     *      of self::$programming_language_map
     *
     * @return array the terms computed from the string
     */
    public static function pythonTokenizer($string, $lang)
    {
        //Comments
        $ordinary_part1 = "_|\(|\)|\[|\]|\{|\}|\+|\-|\*|\/|%";
        /* the dollar sign is escaped so that it stands for the character
           itself rather than the end of subject anchor
         */
        $ordinary_part2 = "\!|&|\||\^|~|\<|\=|\>|,|\.|\:|;|\\\$|\?|#|\@";
        $ordinary = "$ordinary_part1|$ordinary_part2";
        $lower = "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z";
        $upper = "A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z";
        $digit = "0|1|2|3|4|5|6|7|8|9";
        $graphic = "$lower|$upper|$digit|$ordinary";
        $text_chars = "$graphic|\s|\"|\'";
        $comments = "#($text_chars|\\\)*?(\n)";
        //Identifiers
        $id_start = "$lower|$upper|\_";
        $repeat = "$id_start|$digit";
        $identifiers = "($id_start)($repeat)*";
        //Keywords
        $keywords_part1 = "False|None|True|and|as|assert|break|class|continue";
        $keywords_part2 = "finally|for|from|global|elif|import|in|def|del|if";
        $keywords_part3 = "is|lambda|nonlocal|not|or|pass|raise|else|except";
        $keywords_part4 = "return|try|while|with|yield";
        $keywords_string1 = "$keywords_part1|$keywords_part2";
        $keywords_string2 = "$keywords_part3|$keywords_part4";
        $keywords = "($keywords_string1|$keywords_string2)";
        //Operators
        $operators_part1 = "is|in|or|not|and|\+|\-|\*|\/|%|<|>|&";
        $operators_part2 = "\*\*|==|!=|<=|>=|\/\/";
        $operators_part3 = "<<|>>|\^|~|\|";
        $operators = "($operators_part3|$operators_part2|$operators_part1)";
        //Delimiters
        $delimiters_part1 = "\.|,|:|;|@|=|\(|\)|\{|\}|\[|\]";
        $delimiters_part2 = "\+=|\-=|\*=|\/=|\/\/=|%=|\*\*=";
        $delimiters_part3 = "&=|\|=|\^=|<<=|>>=";
        $delimiters = "$delimiters_part3|$delimiters_part2|$delimiters_part1";
        //Floating point Literal
        $digits = "($digit)($digit)*";
        $mantissa = "($digits)\.($digit)*|\.($digits)";
        $exponent = "e[\+|\-]$digits|E[\+|\-]$digits";
        $float_literal = "($mantissa)($exponent)*|($digits)($exponent)";
        //Integer Literal
        $non_zero_digit = "1|2|3|4|5|6|7|8|9";
        $binary_digit = "0|1";
        $octal_digit = "0|1|2|3|4|5|6|7";
        $hex_digit = "$digit|a|b|c|d|e|f|A|B|C|D|E|F";
        $decimal_literal = "0+|($non_zero_digit)($digit)*";
        $binary_literal_part1 = "0b($binary_digit)($binary_digit)*";
        $binary_literal_part2 = "0B($binary_digit)($binary_digit)*";
        $binary_literal = "$binary_literal_part1|$binary_literal_part2";
        $octal_literal_part1 = "0O($octal_digit)($octal_digit)*";
        $octal_literal_part2 = "0o($octal_digit)($octal_digit)*";
        $octal_literal = "$octal_literal_part1|$octal_literal_part2";
        $hex_literal_part1 ="0X($hex_digit)($hex_digit)*";
        $hex_literal_part2 ="0x($hex_digit)($hex_digit)*";
        $hex_literal = "$hex_literal_part1|$hex_literal_part2";
        $integer_literal_part1 = "($binary_literal)|($octal_literal)";
        $integer_literal_part2 = "($hex_literal)|($decimal_literal)";
        $integer_literal = "$integer_literal_part1|$integer_literal_part2";
        //Boolean Literal
        $boolean_literal = "True|False";
        //None Type Literal
        $none_literal = "None";
        //String Literal
        $esc_a = "\\\o[$octal_digit]{3}|\\\h[$hex_digit]{2}|\\\[$text_chars]";
        $unicode = "[^\\x00-\\x80]+";
        $esc_u = "$esc_a|\\\n$unicode|\\\u[$hex_digit]{4}|\\\U[$hex_digit]{8}";
        $raw_opt = "r|R";
        $bytes_opt = "b|B";
        $single_quoted_element1 = "($graphic|$esc_u|\\s|\\t|\')*";
        $single_quoted_element2 = "($graphic|$esc_u|\\s|\\t|\")*";
        $single_quoted_string1 = "(\"$single_quoted_element1\")";
        $single_quoted_string2 = "(\'$single_quoted_element2\')";
        $single_quoted_string =
            "$single_quoted_string1|$single_quoted_string2";
        $triple_quoted_element = "$text_chars|$esc_u";
        $triple_quoted_string1 = "(\"\"\"($triple_quoted_element)*?\"\"\")";
        $triple_quoted_string2 = "(\'\'\'($triple_quoted_element)*?\'\'\')";
        $triple_quoted_string =
            "$triple_quoted_string1|$triple_quoted_string2";
        $string_literal_part1 = "($raw_opt)?($triple_quoted_string)";
        $string_literal_part2 = "($raw_opt)?($single_quoted_string)";
        $string_literal = "$string_literal_part1|$string_literal_part2";
        //Byte Literal
        $single_quoted_element3 = "($graphic|$esc_a|\\s|\\t|\')*";
        $single_quoted_element4 = "($graphic|$esc_a|\\s|\\t|\")*";
        $single_quoted_byte1 = "(\"$single_quoted_element3\")";
        $single_quoted_byte2 = "(\'$single_quoted_element4\')";
        $single_quoted_byte = "$single_quoted_byte1|$single_quoted_byte2";
        $triple_quoted_byte1 = "(\"\"\"($triple_quoted_element)*?\"\"\")";
        $triple_quoted_byte2 = "(\'\'\'($triple_quoted_element)*?\'\'\')";
        $triple_quoted_byte = "$triple_quoted_byte1|$triple_quoted_byte2";
        $bytes_literal_part1 = "($bytes_opt)($raw_opt)?($triple_quoted_byte)";
        $bytes_literal_part2 = "($bytes_opt)($raw_opt)?($single_quoted_byte)";
        $bytes_literal = "$bytes_literal_part1|$bytes_literal_part2";
        //Literals
        $literals_part1 = "$string_literal|$bytes_literal|$float_literal";
        $literals_part2 = "$integer_literal|$boolean_literal|$none_literal";
        $literals = "($literals_part1|$literals_part2)";
        //Python Tokens
        $tokens_part1 = "$comments|$literals|$delimiters";
        $tokens_part2 = "$operators|$keywords|$identifiers";
        $tokens = "($tokens_part1|$tokens_part2)";
        $term_prefix = self::$programming_language_map[$lang] .
            self::CONTROL_WORD_INDICATOR;
        $current_length = strlen($string);
        $position = self::REGEX_INITIAL_POSITION;
        $results = [];
        /* stop when nothing matches any more ($position not 1) or when
           the last match was empty
         */
        while ($position == 1 && $current_length > 0) {
            $position = preg_match("/$tokens/", $string, $matches,
                PREG_OFFSET_CAPTURE);
            if (isset($matches[0][0])) {
                list($token, $offset) = $matches[0];
                // turn the token into a list of white space free words
                $text = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/",
                    "\n", $token);
                $lines = explode("\n", trim($text));
                $line = implode(' ', $lines);
                $data = preg_replace("/[\t\s]+/", ' ', trim($line));
                foreach (explode(" ", trim($data)) as $result) {
                    // compared with "" as empty() would discard the term "0"
                    if ($result !== "") {
                        $results[] = $term_prefix . trim($result);
                    }
                }
                $current_length = strlen($token);
                /* the match need not start at the beginning of $string,
                   so continue from where the match ended rather than
                   from the length of the match
                 */
                $string = trim(substr($string, $offset + $current_length));
            }
        }
        return $results;
    }
    /**
     * Given an array of pre_terms returns the character n-grams for the
     * given terms where n is the length Yioop uses for the language in
     * question. If the tokenizer class for the language does not exist or
     * does not set $char_gram_len, n-gramming is not done and $pre_terms is
     * returned unchanged. This method differs from getCharGramsTerm in that
     * it checks certain words and does not char gram them. For example, it
     * won't char gram urls.
     *
     * @param array $pre_terms the terms to make n-grams for
     * @param string $lang locale tag to determine n to be used for n-gramming
     *
     * @return array the n-grams for the terms in question
     */
    public static function charGramTerms($pre_terms, $lang)
    {
        $tokenizer_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer";
        mb_internal_encoding("UTF-8");
        if (empty($pre_terms)) {
            return [];
        }
        /* The returned tokenizer is not needed here; the call is kept for
           what getTokenizer does as a side effect (it sets the mb encodings
           and records the tokenizer in the class' cache)
         */
        self::getTokenizer($lang);
        $can_char_gram = class_exists($tokenizer_class) &&
            isset($tokenizer_class::$char_gram_len);
        if (!$can_char_gram) {
            // no char gram length for this language: hand terms back as is
            return $pre_terms;
        }
        $grams = [];
        foreach ($pre_terms as $candidate) {
            if (empty($candidate)) {
                continue;
            }
            if (substr($candidate, 0, 4) == 'http') {
                // don't chargram urls
                $grams[] = $candidate;
                continue;
            }
            foreach (self::getCharGramsTerm([$candidate], $lang) as $gram) {
                $grams[] = $gram;
            }
        }
        return $grams;
    }
    /**
     * Returns the characters n-grams for the given terms where n is the length
     * Yioop uses for the language in question. If a stemmer is used for
     * language then n-gramming is not done and this just returns an empty
     * array
     *
     * @param array $terms the terms to make n-grams for
     * @param string $lang locale tag to determine n to be used for n-gramming
     *
     * @return array the n-grams for the terms in question
     */
    public static function getCharGramsTerm($terms, $lang)
    {
        $tokenizer_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer";
        mb_internal_encoding("UTF-8");
        // Without a tokenizer that declares a gram length there is no n
        if (!class_exists($tokenizer_class) ||
            !isset($tokenizer_class::$char_gram_len)) {
            return [];
        }
        $gram_length = $tokenizer_class::$char_gram_len;
        return self::getNGramsTerm($terms, $gram_length);
    }
    /**
     * Returns the character n-grams for the given terms. A term with
     * fewer than $n characters is returned whole rather than n-grammed.
     *
     * @param array $terms the terms to make n-grams for
     * @param int $n the n to use in n-gramming
     *
     * @return array the n-grams for the terms in question
     */
    public static function getNGramsTerm($terms, $n)
    {
        mb_internal_encoding("UTF-8");
        $grams = [];
        foreach ($terms as $word) {
            // index of the last character at which a full n-gram can start
            $final_start = mb_strlen($word) - $n;
            if ($final_start < 0) {
                // word is shorter than n, so it is kept as a single gram
                $grams[] = $word;
                continue;
            }
            $start = 0;
            while ($start <= $final_start) {
                $piece = mb_substr($word, $start, $n);
                if ($piece != "") {
                    $grams[] = $piece;
                }
                $start++;
            }
        }
        return $grams;
    }
    /**
     * Given a string to segment into words (where the string might
     * not contain spaces), this function segments it according to the given
     * locale's segmenter
     *
     * Note: this method is not used when trying to extract keywords from urls.
     * Instead, UrlParser::getWordsInHostUrl($url) is used.
     *
     * @param string $segment string to split into terms
     * @param string $lang IANA tag to look up segmenter under
     * @return array of terms found in the segment
     */
    public static function segmentSegment($segment, $lang)
    {
        // C\PUNCT with its "-|" alternative removed; computed once per process
        static $non_hyphens = "";
        if (empty($non_hyphens)) {
            $non_hyphens = str_replace("-|", "", C\PUNCT);
        }
        if (empty($segment) || empty($lang)) {
            return [];
        }
        $segment_obj = self::getTokenizer($lang);
        $term_string = "";
        /* Text containing a hyphen is normally left to the regex split
           below; for the listed languages the tokenizer's segmenter is used
           regardless. "ja" is the IANA language tag for Japanese; "jp" (a
           country code) is kept so callers that pass it behave as before.
         */
        if (!empty($segment_obj) && method_exists($segment_obj, "segment")
            && (!preg_match("/\-/u", $segment) || in_array($lang, ["zh",
            "zh-CN", "ko", "ja", "jp"]))) {
            $term_string .= $segment_obj->segment($segment);
        } else {
            $term_string = $segment;
        }
        $terms = preg_split("/(\s|$non_hyphens)+/u",
            mb_strtolower(trim($term_string)));
        if ($terms === false) {
            /* preg_split returns false on a PCRE error (for example
               malformed UTF-8 or a backtrack limit); report no terms rather
               than passing false on to array_filter
             */
            return [];
        }
        /* array_filter with no callback drops every falsy piece: the empty
           strings preg_split can produce and also a literal "0" term
         */
        return array_values(array_filter($terms));
    }
    /**
     * Splits supplied string based on white space, then stems each
     * term according to the stemmer for $lang if one exists
     *
     * @param mixed $string_or_array to extract stemmed terms from
     * @param string $lang IANA tag to look up stemmer under
     * @return array stemmed terms if stemmer; terms otherwise
     */
    public static function stemTerms($string_or_array, $lang)
    {
        // Same as stemTermsK except empty array entries are always discarded
        $keep_empties = false;
        return self::stemTermsK($string_or_array, $lang, $keep_empties);
    }
    /**
     * Splits supplied string based on white space, then stems each
     * term according to the stemmer for $lang if one exists
     *
     * @param mixed $string_or_array to extract stemmed terms from
     * @param string $lang IANA tag to look up stemmer under
     * @param bool $keep_empties when $string_or_array is an array, whether
     *      to use its entries as is (true) or to trim them and drop the
     *      empty ones (false)
     * @return array stemmed terms if stemmer; terms otherwise
     */
    public static function stemTermsK($string_or_array, $lang, $keep_empties)
    {
        if (empty($string_or_array)) {
            return [];
        }
        if (!is_array($string_or_array)) {
            // a string is cut at every single white space character
            $words = mb_split("[[:space:]]", $string_or_array);
        } else if ($keep_empties) {
            $words = $string_or_array;
        } else {
            // trim every entry, then keep only the non-empty ones
            $words = array_values(array_filter(
                array_map('trim', $string_or_array)));
        }
        $stemmer = self::getTokenizer($lang);
        $can_stem = !empty($stemmer) && method_exists($stemmer, "stem");
        if (!$can_stem) {
            return $words;
        }
        $stemmed = [];
        foreach ($words as $word) {
            if (strpos($word, "_") === false) {
                $stemmed[] = $stemmer->stem($word);
            } else {
                // terms containing an underscore are passed through unstemmed
                $stemmed[] = $word;
            }
        }
        return $stemmed;
    }
    /**
     * Loads and instantiates a tokenizer object for a language if exists
     *
     * @param string $lang IANA tag to look up stemmer under
     * @return object tokenizer with methods to process strings for a language
     */
    public static function getTokenizer($lang)
    {
        if (empty($lang)) {
            return null;
        }
        /* array_key_exists, not isset: a language with no tokenizer is
           cached as null, and isset() is false for null values, which would
           repeat the glob() lookup below on every call. As with found
           tokenizers, a cached result lasts for the life of the process.
         */
        if (array_key_exists($lang, self::$tokenizers ?? [])) {
            return self::$tokenizers[$lang];
        }
        mb_regex_encoding('UTF-8');
        mb_internal_encoding("UTF-8");
        $lang_parts = explode("-", $lang);
        if (isset($lang_parts[1])) {
            // tag has a region part: try the exact locale folder first
            $tag = str_replace("-", "_", $lang);
            if (!file_exists(C\LOCALE_DIR . "/$tag/resources/Tokenizer.php")) {
                $tag = self::findTokenizerTag($lang_parts[0]);
            }
        } else {
            $tag = self::findTokenizerTag($lang);
        }
        $tokenizer_class_name = C\NS_LOCALE . "$tag\\resources\\Tokenizer";
        $tokenizer_obj = class_exists($tokenizer_class_name) ?
            new $tokenizer_class_name() : null;
        self::$tokenizers[$lang] = $tokenizer_obj;
        return $tokenizer_obj;
    }
    /**
     * Finds the first locale folder whose name starts with $prefix and
     * which contains a resources/Tokenizer.php file
     *
     * @param string $prefix start of the locale folder name to look for
     * @return string name of the matching locale folder, or "" if none
     */
    private static function findTokenizerTag($prefix)
    {
        $tokenizer_path = "/resources/Tokenizer.php";
        $tokenizer_list = glob(C\LOCALE_DIR . "/$prefix*" . $tokenizer_path);
        if (!isset($tokenizer_list[0])) {
            return "";
        }
        // strip the leading LOCALE_DIR/ and the trailing tokenizer path
        return substr($tokenizer_list[0], strlen(C\LOCALE_DIR) + 1,
            - strlen($tokenizer_path));
    }
    /**
     * Calculates the meta words to be associated with a given downloaded
     * document. These words will be associated with the document in the
     * index (for example, server:apache) even if the document itself did
     * not contain them.
     *
     * @param array &$site associated array containing info about a downloaded
     *     (or read from archive) document.
     * @param bool $with_link_metas whether to extract link: meta tags too
     * @return array of meta words to be associate with this document
     */
    public static function calculateMetas(&$site, $with_link_metas = true)
    {
        // handles user added meta words
        if (empty($site[CrawlConstants::META_WORDS])) {
            $site[CrawlConstants::META_WORDS] = [];
        }
        $meta_ids = $site[CrawlConstants::META_WORDS];
        /*
            Handle the built-in meta words. For example
            store the sites the doc_key belongs to,
            so you can search by site
        */
        //we lower case URL even though can cause ambiguity between site paths
        $site_url = mb_strtolower($site[CrawlConstants::URL]);
        $url_sites = UrlParser::getHostPaths($site_url);
        $url_sites = array_merge($url_sites,
            UrlParser::getHostSubdomains($site_url));
        $meta_ids[] = 'site:all';
        foreach ($url_sites as $url_site) {
            if (strlen($url_site) > 0) {
                $meta_ids[] = 'site:' . $url_site;
            }
        }
        // one path: meta word for each successively longer prefix of the path
        $path =  UrlParser::getPath($site_url) ?? "";
        if (strlen($path) > 0 ) {
            $path_parts = explode("/", $path);
            $pre_path = "";
            $meta_ids[] = 'path:all';
            $meta_ids[] = 'path:/';
            foreach ($path_parts as $part) {
                if (strlen($part) > 0 ) {
                    $pre_path .= "/$part";
                    $meta_ids[] = 'path:' . $pre_path;
                }
            }
        }
        if (isset($site[CrawlConstants::HASH])) {
            $meta_ids[] = 'hash:' . base64_encode($site[CrawlConstants::HASH]);
        }
        $meta_ids[] = 'info:' . $site_url;
        $meta_ids[] = 'info:' . crawlHash($site_url);
        $meta_ids[] = 'code:all';
        if (isset($site[CrawlConstants::HTTP_CODE])) {
            $meta_ids[] = 'code:' . $site[CrawlConstants::HTTP_CODE];
        }
        if (UrlParser::getHost($site_url) . "/" ==
            $site_url) {
            $meta_ids[] = 'host:all'; //used to count number of distinct hosts
        }
        // size and timing values are rounded down to a multiple of $interval
        if (isset($site[CrawlConstants::SIZE])) {
            $meta_ids[] = "size:all";
            $interval = C\DOWNLOAD_SIZE_INTERVAL;
            $size = floor($site[CrawlConstants::SIZE]/$interval) * $interval;
            $meta_ids[] = "size:$size";
        }
        if (isset($site[CrawlConstants::TOTAL_TIME])) {
            $meta_ids[] = "time:all";
            $interval = C\DOWNLOAD_TIME_INTERVAL;
            $time = floor(
                $site[CrawlConstants::TOTAL_TIME]/$interval) * $interval;
            $meta_ids[] = "time:$time";
        }
        if (isset($site[CrawlConstants::DNS_TIME])) {
            $meta_ids[] = "dns:all";
            $interval = C\DOWNLOAD_TIME_INTERVAL;
            $time = floor(
                $site[CrawlConstants::DNS_TIME]/$interval) * $interval;
            $meta_ids[] = "dns:$time";
        }
        if (isset($site[CrawlConstants::LINKS]) &&
            is_array($site[CrawlConstants::LINKS])) {
            $num_links = count($site[CrawlConstants::LINKS]);
            $meta_ids[] = "numlinks:all";
            $meta_ids[] = "numlinks:$num_links";
            // the keys of the LINKS array are used as the link urls
            $link_urls = array_keys($site[CrawlConstants::LINKS]);
            if ($with_link_metas) {
                $meta_ids[] = "link:all";
                foreach ($link_urls as $url) {
                    $meta_ids[] = 'link:' . $url;
                    $meta_ids[] = 'link:' . crawlHash($url);
                }
            }
        }
        if (isset($site[CrawlConstants::CLD_IN_COMMON])) {
            $meta_ids[] = 'cld:' . $site[CrawlConstants::CLD_IN_COMMON];
        }
        if (isset($site[CrawlConstants::LOCATION]) &&
            is_array($site[CrawlConstants::LOCATION])){
            foreach ($site[CrawlConstants::LOCATION] as $location) {
                $meta_ids[] = 'info:' . $location;
                $meta_ids[] = 'info:' . crawlHash($location);
                $meta_ids[] = 'location:all';
                $meta_ids[] = 'location:' . $location;
            }
        }
        if (isset($site[CrawlConstants::IP_ADDRESSES]) ){
            $meta_ids[] = 'ip:all';
            foreach ($site[CrawlConstants::IP_ADDRESSES] as $address) {
                $meta_ids[] = 'ip:' . $address;
            }
        }
        // exactly one of the video, image, or text media branches applies
        $meta_ids[] = 'media:all';
        if (!empty($site[CrawlConstants::IS_VIDEO])) {
            $meta_ids[] = "media:video";
            if (!empty($site[CrawlConstants::DURATION])) {
                /* for each threshold the video gets either a -plus or a
                   -minus word, so every threshold can be queried both ways
                 */
                $durations = [ 60, 300, 600, 900, 1800, 3600, 7200];
                $duration = intval($site[CrawlConstants::DURATION]);
                if ($duration > 0) {
                    foreach ($durations as $time) {
                        if ($duration > $time) {
                            $meta_ids[] = "duration:$time-plus";
                        } else {
                            $meta_ids[] = "duration:$time-minus";
                        }
                    }
                }
            }
            if (!empty($site[CrawlConstants::HEIGHT])) {
                /* Thresholds are frame heights in pixels: a 4K UHD frame is
                   3840x2160, full HD is 1920x1080, and HD is 1280x720
                 */
                $height = $site[CrawlConstants::HEIGHT];
                $meta_ids[] = ($height >= 2160) ?
                    "media:video-4k-plus" : "media:video-4k-minus";
                $meta_ids[] = ($height >= 1080) ?
                    "media:video-fhd-plus" : "media:video-fhd-minus";
                $meta_ids[] = ($height >= 720) ?
                    "media:video-hd-plus" : "media:video-hd-minus";
            }
        } else if (!empty($site[CrawlConstants::TYPE]) &&
            stripos($site[CrawlConstants::TYPE], "image") !== false) {
            if (!empty($site[CrawlConstants::WIDTH]) &&
                !empty($site[CrawlConstants::HEIGHT])) {
                // $size here is the image area in pixels
                $size = $site[CrawlConstants::WIDTH] *
                    $site[CrawlConstants::HEIGHT];
                if (empty($site[CrawlConstants::THUMB])) {
                    $meta_ids[] = 'media:image-no-thumb';
                } else if($size < 50) {
                    $meta_ids[] = 'media:image-tracking';
                } else {
                    if ($size < 100000) {
                        $meta_ids[] = 'media:image-small';
                    } else if ($size < 400000) {
                        $meta_ids[] = 'media:image-medium';
                    } else {
                        $meta_ids[] = 'media:image-large';
                    }
                    $meta_ids[] = 'media:image';
                }
                // a side more than 10% longer than the other decides layout
                if ($site[CrawlConstants::WIDTH] >
                    1.1 * $site[CrawlConstants::HEIGHT]) {
                    $meta_ids[] = 'layout:wide';
                } else if ($site[CrawlConstants::HEIGHT] >
                    1.1 * $site[CrawlConstants::WIDTH]) {
                    $meta_ids[] = 'layout:tall';
                } else {
                    $meta_ids[] = 'layout:square';
                }
            } else {
                if (empty($site[CrawlConstants::THUMB])) {
                    $meta_ids[] = 'media:image-no-thumb';
                } else {
                    $meta_ids[] = 'media:image';
                }
            }
            if (!empty($site[CrawlConstants::IS_BLACK_AND_WHITE])) {
                $meta_ids[] = 'color:bw';
            } else {
                $meta_ids[] = 'color:color-only';
            }
            if (!empty($site[CrawlConstants::AVERAGE_COLOR])) {
                $colors = [ "black" => [0,0,0], "white" => [255,255,255],
                    "red" => [255,0,0], "lime" => [0,255,0],
                    "blue" => [0,0,255], "yellow" => [255,255,0],
                    "cyan" => [0,255,255], "magenta" => [255,0,255],
                    "silver" => [192,192,192], "gray" => [128,128,128],
                    "maroon" => [128,0,0], "olive" => [128,128,0],
                    "green" => [0,128,0], "purple" => [128,0,128],
                    "teal" => [0,128,128], "navy" => [0,0,128],
                ];
                /* pick the named color whose vector has the smallest squared
                   distance to the first three components of the average color
                 */
                $avg_color = $site[CrawlConstants::AVERAGE_COLOR];
                $best_color = "black";
                $best_distance = 270000; // bigger than 3*255*255
                foreach ($colors as $color_name => $color_vector) {
                    $color_distance = 0;
                    for ($j = 0; $j < 3; $j++) {
                        $diff_color = $color_vector[$j] - $avg_color[$j];
                        $color_distance += $diff_color * $diff_color;
                    }
                    if ($color_distance < $best_distance) {
                        $best_color = $color_name;
                        $best_distance = $color_distance;
                    }
                }
                $meta_ids[] = 'color:' . $best_color;
            }
        } else {
            $meta_ids[] = 'media:text';
        }
        if (!empty($site[CrawlConstants::IS_VR])) {
            $meta_ids[] = "media:vr";
        }
        // store the filetype info
        $url_type = UrlParser::getDocumentType($site_url);
        if (strlen($url_type) > 0) {
            $meta_ids[] = 'filetype:all';
            $meta_ids[] = 'filetype:' . $url_type;
        }
        if (isset($site[CrawlConstants::SERVER])) {
            $meta_ids[] = 'server:all';
            $meta_ids[] = 'server:' . strtolower($site[CrawlConstants::SERVER]);
        }
        if (isset($site[CrawlConstants::SERVER_VERSION])) {
            $meta_ids[] = 'version:all';
            $meta_ids[] = 'version:' .
                $site[CrawlConstants::SERVER_VERSION];
        }
        if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
            $meta_ids[] = 'os:all';
            $meta_ids[] = 'os:'. strtolower(
                $site[CrawlConstants::OPERATING_SYSTEM]);
        }
        /* NOTE(review): in the three date groups below 'Y-m-W' (W is the
           ISO-8601 week number) has the same year-two digits-two digits
           shape as 'Y-m-d', so a week word can equal the day word of a
           different date -- confirm this overlap is acceptable
         */
        if (isset($site[CrawlConstants::MODIFIED])) {
            $modified = $site[CrawlConstants::MODIFIED];
            $meta_ids[] = 'modified:all';
            $meta_ids[] = 'modified:' . date('Y', $modified);
            $meta_ids[] = 'modified:' . date('Y-m', $modified);
            $meta_ids[] = 'modified:' . date('Y-m-W', $modified);
            $meta_ids[] = 'modified:' . date('Y-m-d', $modified);
        }
        // date appeared on internet
        if (!empty($site[CrawlConstants::TIMESTAMP])) {
            $date = $site[CrawlConstants::TIMESTAMP];
            $meta_ids[] = 'date:all';
            $meta_ids[] = 'date:' . date('Y', $date);
            $meta_ids[] = 'date:' . date('Y-m', $date);
            $meta_ids[] = 'date:' . date('Y-m-W', $date);
            $meta_ids[] = 'date:' . date('Y-m-d', $date);
            $meta_ids[] = 'date:' . date('Y-m-d-H', $date);
            $meta_ids[] = 'date:' . date('Y-m-d-H-i', $date);
            $meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date);
        }
        // date published as given by open graph or rss
        if (!empty($site[CrawlConstants::PUBDATE])) {
            $date = $site[CrawlConstants::PUBDATE];
            $meta_ids[] = 'pubdate:all';
            $meta_ids[] = 'pubdate:' . date('Y', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m-W', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m-d', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m-d-H', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m-d-H-i', $date);
            $meta_ids[] = 'pubdate:' . date('Y-m-d-H-i-s', $date);
        }
        if (isset($site[CrawlConstants::LANG])) {
            $meta_ids[] = 'lang:all';
            $lang = strtolower($site[CrawlConstants::LANG]);
            $lang_parts = explode("-", $lang);
            $meta_ids[] = 'lang:' . $lang_parts[0];
            if (isset($lang_parts[1])) {
                $meta_ids[] = 'lang:' . $lang;
            }
            // 'mul' documents get a lang: word for every stopword locale
            if ($lang == 'mul') {
                foreach (localesWithStopwordsList() as $lang) {
                    $lang_parts = explode("-", $lang);
                    $meta_ids[] = 'lang:' . $lang_parts[0];
                }
            }
        }
        if (isset($site[CrawlConstants::AGENT_LIST])) {
            foreach ($site[CrawlConstants::AGENT_LIST] as $agent) {
                $meta_ids[] = 'robot:' . strtolower($agent);
            }
        }
        //Add all meta word for subdoctype
        if (isset($site[CrawlConstants::SUBDOCTYPE])){
            $meta_ids[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
        }
        // safe:true / safe:false are only added when IS_SAFE is set at all
        $meta_ids[] = "safe:all";
        if (!empty($site[CrawlConstants::IS_SAFE])) {
            $meta_ids[] = "safe:true";
        } else if (isset($site[CrawlConstants::IS_SAFE])) {
            $meta_ids[] = "safe:false";
        }
        return $meta_ids;
    }
    /**
     * Used to compute all the meta ids for a given link with $url
     * and $link_text that was on a site with $site_url.
     *
     * @param string $url url of the link
     * @param string $link_host url of the host name of the link
     * @param string $link_text text of the anchor tag link came from
     * @param string $site_url url of the page link was on
     * @param array $url_info key value pairs which may have been generated
     *  as part of the page processor
     * @param array $link_word_lists list of words used in anchor text
     *  associated with this link and their positions in the anchor text
     * @return array meta words associated with the link
     */
    public static function calculateLinkMetas($url, $link_host, $link_text,
        $site_url, $url_info = [], $link_word_lists = [])
    {
        $link_meta_ids = [];
        // a link without a host gets no meta words at all
        if (strlen($link_host) == 0) {
            return $link_meta_ids;
        }
        if (substr($link_text, 0, 9) == "location:") {
            // the link text is itself a location: meta word, so keep it whole
            $link_meta_ids[] = $link_text;
            $link_meta_ids[] = "location:all";
            $link_meta_ids[] = "location:" . crawlHash($site_url);
        }
        $link_type = UrlParser::getDocumentType($url);
        $link_meta_ids[] = "media:all";
        // safe:true / safe:false are only added when IS_SAFE is set at all
        $link_meta_ids[] = "safe:all";
        if (!empty($url_info[CrawlConstants::IS_SAFE])) {
            $link_meta_ids[] = "safe:true";
        } else if (isset($url_info[CrawlConstants::IS_SAFE])) {
            $link_meta_ids[] = "safe:false";
        }
        /* Assumes PageProcessor::$image_types populated. True if called
           from Fetcher or CrawlComponent
         */
        if (in_array($link_type, PageProcessor::$image_types)) {
            $link_meta_ids[] = "media:image-link";
        } else if (!empty($url_info['media'])) {
            $link_meta_ids[] = "media:" . $url_info['media'];
        } else {
            $link_meta_ids[] = "media:text";
        }
        // date words are only added when pubdate parses to a non-zero time
        if (!empty($url_info['pubdate']) &&
            $date = strtotime($url_info['pubdate'])) {
            $link_meta_ids[] = 'date:all';
            $link_meta_ids[] = 'date:' . date('Y', $date);
            $link_meta_ids[] = 'date:' . date('Y-m', $date);
            $link_meta_ids[] = 'date:' . date('Y-m-d', $date);
            $link_meta_ids[] = 'date:' . date('Y-m-d-H', $date);
            $link_meta_ids[] = 'date:' . date('Y-m-d-H-i', $date);
            $link_meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date);
        }
        $link_meta_ids[] = "link:all";
        // hash of $url, computed at most once and only if there are terms
        $url_hash = null;
        // only the terms (keys) are used; the position lists are ignored
        foreach ($link_word_lists as $term => $pos) {
            $term_hash = crawlHash($term);
            $url_hash ??= crawlHash($url);
            $link_meta_ids[] = "link:$term";
            $link_meta_ids[] = "link:". $term_hash;
            $link_meta_ids[] = "link:$term:$url";
            $link_meta_ids[] = "link:". $term_hash . ":" . $url_hash;
        }
        if (!empty($url_info[CrawlConstants::LANG])) {
            $link_meta_ids[] = 'lang:all';
            $lang = strtolower($url_info[CrawlConstants::LANG]);
            $lang_parts = explode("-", $lang);
            $link_meta_ids[] = 'lang:' . $lang_parts[0];
            if (isset($lang_parts[1])) {
                $link_meta_ids[] = 'lang:' . $lang;
            }
            // 'mul' links get a lang: word for every stopword locale
            if ($lang == 'mul') {
                foreach (localesWithStopwordsList() as $lang) {
                    $lang_parts = explode("-", $lang);
                    $link_meta_ids[] = 'lang:' . $lang_parts[0];
                }
            }
        }
        return $link_meta_ids;
    }
    /**
     * Used to split a string of text in the language given by $locale into
     * space separated words. Ex: "acontinuousstringofwords" becomes
     * "a continuous string of words". It operates by scanning from the end of
     * the string to the front and splitting on the longest segment that is a
     * word.
     *
     * @param string $segment string to make into a string of space separated
     *     words
     * @param string $locale IANA tag used to look up dictionary filter to
     *     use to do this segmenting
     * @param array $additional_regexes which should be treated as a suffix
     * @return string space separated words
     */
    public static function reverseMaximalMatch($segment, $locale,
        $additional_regexes =[])
    {
        /* A space is put in front so that the right-to-left scan below
           always finishes on a space, which is what flushes the last
           pending word guess
         */
        $segment = " " . $segment;
        $len = mb_strlen($segment);
        $cur_pos = $len;
        // with the space added above $len should be at least 1 here
        if ($cur_pos < 1) {
            return $segment;
        }
        /* $out_segment is built back to front: each piece is appended
           byte-reversed with strrev, and the whole string is byte-reversed
           once more at the end, which restores both the original byte order
           inside each piece and the left-to-right order of the pieces
         */
        $out_segment = "";
        $char_added = "";
        // characters scanned so far that have not yet been output
        $word_guess = "";
        $was_space = true;
        while($cur_pos > 0) {
            $cur_pos--;
            $char_added =  mb_substr($segment, $cur_pos, 1);
            // trim() leaves "" for the white space characters it strips
            $is_space = trim($char_added) == "";
            if ($is_space && $was_space) {
                // collapse a run of spaces
                continue;
            } else if ($is_space) {
                // a space ends the current guess, so output it
                $was_space = true;
                $one_word = self::oneWord($word_guess, $locale,
                    $additional_regexes);
                if ($one_word) {
                    $out_segment .= " " . strrev($word_guess);
                    $word_guess = "";
                } else {
                    /* NOTE(review): on this path the first character of
                       $word_guess is not appended to $out_segment before
                       the guess is cleared -- confirm this is intended
                     */
                    $out_segment .= " " . strrev(mb_substr($word_guess, 1));
                    $out_segment .= " " . $char_added;
                    $word_guess = "";
                }
                continue;
            } else {
                // scanning backwards, so the new character goes in front
                $word_guess = $char_added . $word_guess;
                $was_space = false;
            }
            /* The guess keeps growing while "*" . $word_guess is found in
               the "segment" n-grams for the locale or an additional regex
               matches it (presumably the "*" entries mark word suffixes --
               verify against how the segment filter is built)
             */
            $is_suffix = NWordGrams::ngramsContains("*" . $word_guess,
                $locale, "segment");
            if (!$is_suffix) {
                foreach ($additional_regexes as $regex) {
                    if (preg_match($regex, $word_guess)) {
                        $is_suffix = true;
                        break;
                    }
                }
            }
            if (!$is_suffix) {
                if (mb_strlen($word_guess) > 1 &&
                    !self::oneWord($word_guess, $locale, $additional_regexes)) {
                    /* the new character broke the match: output the guess
                       as it was before this character and start a new guess
                       from the character
                     */
                    $out_segment .= " " . strrev(mb_substr($word_guess, 1));
                    $word_guess = $char_added;
                } else {
                    // single character or a complete word: output all of it
                    $out_segment .= " " . strrev($word_guess);
                    $word_guess = "";
                }
                $was_space = false;
            }
        }
        // undo the byte reversal applied to the pieces above
        $out_segment = strrev($out_segment);
        return $out_segment;
    }
    /**
     * Checks if a given word guess is a single word with respect to
     * a word detection bloom filter and regexes
     *
     * @param string $word_guess word guess to be checked if a single word
     * @param string $locale language to check if is word for
     * @param array $additional_regexes used in checking for this locale if
     *  something should be considered a word
     * @return bool true if a single word false otherwise
     */
    public static function oneWord($word_guess, $locale, $additional_regexes)
    {
        // first ask the locale's "segment" n-grams about the guess
        if (NWordGrams::ngramsContains($word_guess, $locale, "segment")) {
            return true;
        }
        // otherwise the guess counts as a word if any extra regex matches it
        foreach ($additional_regexes as $pattern) {
            if (preg_match($pattern, $word_guess)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Scores a phrase according to how many sexually explicit terms it
     * contains. Tries to work for several languages. Very crude classifier.
     *
     * If a non-empty $url matches a list of known adult site keywords the
     * score is 1 regardless of the phrase. Otherwise, the score is the number
     * of unsafe term matches divided by an estimate of the number of term
     * boundaries in the phrase. For text that has word boundaries, unsafe
     * terms must occur as whole words; for text that appears to lack word
     * boundaries (for example, Chinese), unsafe terms are matched as
     * substrings.
     *
     * @param string $phrase to check for X-ratedness
     * @param string $url optional url the phrase came from, checked against
     *  keywords taken from known porn sites
     * @return int|float $score of how explicit the phrase is, between 0 and 1
     */
    public static function computeSafeSearchScore($phrase, $url = "")
    {
        static $pre_unsafe_regex = "XXX|sex|slut|nymphomaniac|MILF|lolita|" .
            "lesbian|sadomasochism|bondage|fisting|erotic|vagina|Tribadism|" .
            "penis|facial|hermaphrodite|transsexual|tranny|bestiality|snuff|" .
            "boob|fondle|tit|blowjob|lap|cock|dick|hardcore|pr0n|fuck|pussy|" .
            "penetration|ass|cunt|bisexual|prostitution|screw|ass|swinging|" .
            "masturbation|clitoris|clit|suck|whore|bitch|cuckold|porn|melon|" .
            "femdom|exhibitionism|bellaco|cachar|chingar|shimar|chinquechar|" .
            "chichar|clavar|coger|culear|hundir|joder|mámalo|singar|cojon|" .
            "carajo|caray|bicho|concha|chucha|chocha|chuchamadre|coño|" .
            "panocha|almeja|culo|fundillo|fundío|puta|puto|teta|connorito|" .
            "cul|pute|putain|sexe|pénis|vulve|foutre|baiser|sein|nicher|" .
            "nichons|puta|sapatão|foder|ferro|punheta|vadia|buceta|bucetinha|" .
            "bunda|caralho|mentula|cunnus|verpa|sōpiō|pipinna|cōleī|" .
            "cunnilingus|futuō|copulate|cēveō|crīsō|scortor|meretrīx|" .
            "futatrix|minchia|coglione|cornuto|culo|inocchio|frocio|puttana|" .
            "vaffanculo|fok|hoer|kut|lul|やりまん|打っ掛け|二形|ふたなりゴックン|" .
            "ゴックン|ショタコン|全裸|受け|裏本|пизда́|хуй|еба́ть|блядь|елда́|гондо́н|" .
            "хер|манда́|му́ди|мудя|пидора́с|залу́па|жо́па|за́дница|буфер|雞巴|鷄巴|" .
            "雞雞|鷄鷄|阴茎|陰莖|胯下物|屌|吊|小鳥|龟头|龜頭|屄|鸡白|雞白|傻屄|老二|" .
            "那话儿|那話兒|屄|鸡白|雞白|阴道|陰道|阴户|陰戶|大姨妈|淫蟲|老嫖|妓女|" .
            "臭婊子|卖豆腐|賣豆腐|咪咪|大豆腐|爆乳|肏操|炒饭|炒飯|cặc|lồn|kaltak|" .
            "orospu|siktir|sıçmak|amcık";
        static $boundary_regex = "";
        static $no_boundary_regex = "";
        /* took keywords from top level domains from some of theporndude list
         */
        static $unsafe_url_regex = "/porn|xvideos|livejasmin|".
            "xhamster|bongacams|chaturbate|pussy|spankbang|".
            "xnxx|tnxx|beeg|daftsex|redtube|youjizz|vidz7|4tube|cumlouder|" .
            "tnaflix|xfantasy|vdiz24|luxuretv|perfectgirls|anysex|drtuber|" .
            "waxtube|netfapx|xmoviesforyou|letsjerk|likuoo|xxxstreams|horny|" .
            "freeomovie|cliphunter|xtapes|sweext|slut|xkeezmovies|sexgalaxy|" .
            "motherless|xopenload|palimas|sextvx|hotgirlclub|sexu|pandamovies|".
            "xxxstreams|fullxxxmovies|palmtube|fakingstv|eroprofile|hclips|" .
            "xtube|whore|voyeurhit|thothub|hotscope|gonewild|nsfwonsnap|" .
            "watchmygf|yuvutu|camvideos|reallifecam|uflash|nudevista|" .
            "findtubes|rexxx|ro89|bellesa|forhertube|ixxx|thumbzilla|fuq|" .
            "tubegalore|alohatube|elephanttube|iwank|porzo|lobstertube|" .
            "maturetube|dinotube|melonstube|assoass|tonicmovies|videoone|" .
            "video-one|tiava|fapvid|tubegals|voyeur\-house|voyeurhouse|" .
            "incest|milf|camarads|lifeundercams|tushy|trueanal|analized|" .
            "elegantanal|girlsway|whengirlsplay|welivetogether|mommysgirl|" .
            "fleshlight|sexysexdoll|realdoll|adameve|adultempire|j\-list|".
            "efukt|shooshtime|inhumanity|humoron|9gag2|crazyshit|theync|".
            "kaotic|xrares|reblop|sickjunk|cutscenes|bestgore|".
            "escortdirectory|eurogirlsescort|erotic|".
            "skipthegames|escortmeetings|slixa|tsescorts|ts4rent|adultsearch|".
            "escortnews|escortguide|uescort|adultwork|humpchies|".
            "scarletblue|locanto|skokka|escort-ireland|newzealandgirls|".
            "listcrawler|cityxguide|damvler|nhentai|flirt4free|".
            "furaffinity|spankwire|planetsuzy|ebaumsworld|".
            "luscious\.net|hentai|freeones\.com|iafd|gayboystube|".
            "adam4adam|cams\.com|mrskin|adultwork|oglaf|streamate|".
            "nifty\.org|adultdvd|suicidegirls|ftvgirls|asstr|private\.com|".
            "squirt\.org|fakku|faapy|fux|txxx|\Wnude\W/i";
        if (empty($boundary_regex)) {
            /* The alternation has to be grouped, otherwise the \b assertions
               bind only to the first and last alternatives and every other
               term matches as a substring (e.g., "ass" inside "class").
             */
            $boundary_regex = "/\b(?:$pre_unsafe_regex)\b/ui";
            $no_boundary_regex = "/$pre_unsafe_regex/ui";
        }
        if (!empty($url) && preg_match($unsafe_url_regex, $url)) {
            return 1;
        }
        if (empty($phrase)) {
            return 0;
        } else if (!is_string($phrase)) { // wrong type is X-rated!
            return 1;
        }
        /* Note each delimited word contributes two matches of \b (its start
           and its end) to this count.
         */
        $term_boundaries = preg_match_all("/\b/", $phrase);
        $len = max(mb_strlen($phrase), 1);
        /*
           8 characters is greater than the average word length for most
           languages. So if the number of term boundaries is < the length
           of the string/8, likely we have a language which doesn't use
           word boundaries like Chinese. In this case, we will assume
           around 3 character per word (maybe higher for Chinese, low for
           Japanese or Korean?)
         */
        if ($term_boundaries < ceil($len/8)) { //maybe text
            $term_boundaries = ceil($len/3);
            $unsafe_regex = $no_boundary_regex;
        } else {
            $unsafe_regex = $boundary_regex;
        }
        $match_count = preg_match_all($unsafe_regex, $phrase);
        if ($match_count === false) {
            /* preg_match_all failed (for instance, $phrase was not valid
               UTF-8 for the /u modifier); treat this as no matches
             */
            $match_count = 0;
        }
        $score = $match_count/$term_boundaries;
        return $score;
    }
    /**
     * Compresses a sentence using the tokenizer for the given locale, if
     * sentence compression is turned on and that tokenizer provides a
     * compressSentence method. In every other case the sentence is handed
     * back unchanged.
     *
     * @param string $sentence_to_compress the sentence to compress
     * @param string $lang locale tag used to look up the tokenizer
     * @return string the compressed sentence, or the original sentence if
     *  no compression was carried out
     */
    public static function compressSentence($sentence_to_compress,
        $lang = null)
    {
        // nothing to do if the feature is off or no locale was supplied
        if (!C\SENTENCE_COMPRESSION_ENABLED || empty($lang)) {
            return $sentence_to_compress;
        }
        $tokenizer = self::getTokenizer($lang);
        // only tokenizers that implement compressSentence can be used
        if (empty($tokenizer) ||
            !method_exists($tokenizer, "compressSentence")) {
            return $sentence_to_compress;
        }
        return $tokenizer->compressSentence($sentence_to_compress);
    }
}
ViewGit