Fixes to get the question answering system to display again, a=chris

Chris Pollett [2022-08-19 15h]
Fixes to get the question answering system to display again, a=chris
Filename
src/controllers/SearchController.php
src/library/processors/HtmlProcessor.php
src/locale/en_US/resources/Tokenizer.php
src/models/PhraseModel.php
src/views/elements/SearchElement.php
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 1080a47a7..bb3ac901c 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -170,7 +170,6 @@ class SearchController extends Controller implements CrawlConstants
             if (isset($data["PAGES"])) {
                 $count = count($data["PAGES"]);
                 for ($i = 0; $i < $count; $i++) {
-                    unset($data["PAGES"][$i]["OUT_SCORE"]);
                     $data["PAGES"][$i][self::SCORE]= "".
                         round($data["PAGES"][$i][self::SCORE], 3);
                     $data["PAGES"][$i][self::DOC_RANK]= "".
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index b571f140b..bda6ff06b 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -273,7 +273,7 @@ class HtmlProcessor extends TextProcessor
         $out_links = [];
         foreach ($links as $link_url => $link_text) {
             // Avoid redirects in top-level links
-            if (preg_match("/^Location/i", $link_text)) {
+            if (preg_match("/^(Location|http)/i", $link_text)) {
                 continue;
             }
             $cld = UrlParser::getCompanyLevelDomain($url);
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index 3c104d8b6..afe21e5d9 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -387,7 +387,9 @@ class Tokenizer
             $current = ['token' => $token, 'tag' => 'NN'];
             // remove trailing full stops
             $token = mb_strtolower($token);
-            if (!empty($dictionary[$token])) {
+            if ($token == "howto") {
+                $current['tag'] = 'WP';
+            } else if (!empty($dictionary[$token])) {
                 $tag_list = explode(" ", $dictionary[$token]);
                 $current['tag'] = $tag_list[0];
             }
@@ -993,6 +995,7 @@ class Tokenizer
             $word_and_phrase = preg_replace("/[\{\[\(][^\}\]\)]+[\}\]\)]/u",
                 "", $word_and_phrase);
             $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
+            print_r($tagged_phrase);
             $parse_tree = self::parseWholePhrase($tagged_phrase,
                 ['cur_node' => 0]);
             $triplets = self::extractTripletsParseTree($parse_tree);
@@ -1221,6 +1224,7 @@ class Tokenizer
      */
     public static function questionParser($question)
     {
+        $question = preg_replace("/how\s+to/", "howto", $question);
         $tagged_question = self::tagTokenizePartOfSpeech($question);
         $generated_questions = [];
         if (isset($tagged_question[0])) {
@@ -1230,7 +1234,8 @@ class Tokenizer
                 if ($token == "WHO") {
                     $generated_questions = self::parseWhoQuestion(
                         $tagged_question, 1);
-                } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) {
+                } else if (in_array($token, ["WHERE", "WHEN", "WHAT",
+                    "HOW", "HOWTO"])) {
                     $generated_questions = self::parseWHPlusQuestion(
                         $tagged_question, 1);
                 }
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 4d3f381e9..57185efee 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -368,6 +368,7 @@ class PhraseModel extends ParallelModel
                     }
                 }
                 if ($guess_semantics) {
+                    $original_has_disjuncts = (strpos($phrase, "|") !== false);
                     $repeat_check = [];
                     $phrase = "";
                     $delim = " ";
@@ -432,28 +433,40 @@ class PhraseModel extends ParallelModel
                         $i < min($bound[0] + $bound[1], $results_high);
                         $i++) {
                         if (isset($out_results['PAGES'][$out_count])) {
-                            if (!strstr($phrase, "|") &&
+                            if (empty($original_has_disjuncts) &&
                                 isset($out_results['PAGES'][$out_count]
                                 [self::QUESTION_ANSWERS])) {
                                 $triplets_with_answer =
                                     $out_results['PAGES'][$out_count]
                                     [self::QUESTION_ANSWERS];
-                                $question = trim(
-                                    PhraseParser::stemCharGramSegment($phrase,
-                                    L\guessLocaleFromString($phrase), true));
-                                if (isset($triplets_with_answer[$question])) {
-                                    $out_results['PAGES'][$out_count]['ANSWER']=
-                                        $triplets_with_answer[$question];
-                                    $answer = $triplets_with_answer[$question];
-                                    if (array_key_exists(
-                                        $answer, $answer_score_map)) {
-                                        $new_score = $answer_score_map[$answer]
-                                        + $out_results['PAGES'][$out_count]
-                                            ['OUT_SCORE'];
-                                    } else {
-                                        $answer_score_map[$answer] =
-                                        $out_results['PAGES'][$out_count]
-                                            ['OUT_SCORE'];
+                                $question_phrases =
+                                    explode("|",
+                                    preg_replace("/\S+\:\S+/u", "", $phrase));
+                                foreach ($question_phrases as $question_phrase){
+                                    $question = trim(
+                                        PhraseParser::stemCharGramSegment(
+                                        $question_phrase,
+                                        L\guessLocaleFromString(
+                                        $question_phrase), true));
+                                    if (isset(
+                                        $triplets_with_answer[$question])) {
+                                        $out_results['PAGES'][$out_count][
+                                            'ANSWER']=
+                                            $triplets_with_answer[$question];
+                                        $answer =
+                                            $triplets_with_answer[$question];
+                                        if (array_key_exists(
+                                            $answer, $answer_score_map)) {
+                                            $new_score =
+                                                $answer_score_map[$answer]
+                                                + $out_results['PAGES'][
+                                                $out_count][self::SCORE];
+                                        } else {
+                                            $answer_score_map[$answer] =
+                                            $out_results['PAGES'][$out_count]
+                                                [self::SCORE];
+                                        }
+                                        break;
                                     }
                                 }
                             }
@@ -934,9 +947,9 @@ class PhraseModel extends ParallelModel
             $generated_question = $tokenizer->questionParser(
                 $phrase, $tag);
             if (!empty($generated_question['CONCISE'])) {
-                $phrase = $generated_question['CONCISE'][0];
+                $phrase = implode("|", $generated_question['CONCISE']);
             } else if (!empty($generated_question['RAW'])) {
-                $phrase = $generated_question['RAW'][0];
+                $phrase = implode("|", $generated_question['RAW']);
             }
         }
         if ($len > 0 && !preg_match("/site\:\S{5}|info\:|path\:|ip\:/",
diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php
index 296f9e4d3..ffa6d4f91 100644
--- a/src/views/elements/SearchElement.php
+++ b/src/views/elements/SearchElement.php
@@ -144,7 +144,8 @@ class SearchElement extends Element implements CrawlConstants
         }
         if (!empty($data['BEST_ANSWER'])) {
             ?><div id="best-answer" class="echo-link">
-                 <?= $data['BEST_ANSWER']; ?>
+                 <?= tl('search_element_possible_answer') . " ".
+                    $data['BEST_ANSWER']; ?>
             </div><?php
         }
         if (empty($data['PAGES']) && empty($data['TREND_DATA']) &&
ViewGit