diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 1080a47a7..bb3ac901c 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -170,7 +170,6 @@ class SearchController extends Controller implements CrawlConstants
if (isset($data["PAGES"])) {
$count = count($data["PAGES"]);
for ($i = 0; $i < $count; $i++) {
- unset($data["PAGES"][$i]["OUT_SCORE"]);
$data["PAGES"][$i][self::SCORE]= "".
round($data["PAGES"][$i][self::SCORE], 3);
$data["PAGES"][$i][self::DOC_RANK]= "".
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index b571f140b..bda6ff06b 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -273,7 +273,7 @@ class HtmlProcessor extends TextProcessor
$out_links = [];
foreach ($links as $link_url => $link_text) {
// Avoid redirects in top-level links
- if (preg_match("/^Location/i", $link_text)) {
+ if (preg_match("/^(Location|http)/i", $link_text)) {
continue;
}
$cld = UrlParser::getCompanyLevelDomain($url);
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index 3c104d8b6..afe21e5d9 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -387,7 +387,9 @@ class Tokenizer
$current = ['token' => $token, 'tag' => 'NN'];
// remove trailing full stops
$token = mb_strtolower($token);
- if (!empty($dictionary[$token])) {
+ if ($token == "howto") {
+ $current['tag'] = 'WP';
+ } else if (!empty($dictionary[$token])) {
$tag_list = explode(" ", $dictionary[$token]);
$current['tag'] = $tag_list[0];
}
@@ -993,6 +995,7 @@ class Tokenizer
$word_and_phrase = preg_replace("/[\{\[\(][^\}\]\)]+[\}\]\)]/u",
"", $word_and_phrase);
$tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
+ // parse the tagged tokens into a tree, then extract its triplets
$parse_tree = self::parseWholePhrase($tagged_phrase,
['cur_node' => 0]);
$triplets = self::extractTripletsParseTree($parse_tree);
@@ -1221,6 +1224,7 @@ class Tokenizer
*/
public static function questionParser($question)
{
+ $question = preg_replace("/\bhow\s+to\b/i", "howto", $question);
$tagged_question = self::tagTokenizePartOfSpeech($question);
$generated_questions = [];
if (isset($tagged_question[0])) {
@@ -1230,7 +1234,8 @@ class Tokenizer
if ($token == "WHO") {
$generated_questions = self::parseWhoQuestion(
$tagged_question, 1);
- } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) {
+ } else if (in_array($token, ["WHERE", "WHEN", "WHAT",
+ "HOW", "HOWTO"])) {
$generated_questions = self::parseWHPlusQuestion(
$tagged_question, 1);
}
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 4d3f381e9..57185efee 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -368,6 +368,7 @@ class PhraseModel extends ParallelModel
}
}
if ($guess_semantics) {
+ $original_has_disjuncts = (strpos($phrase, "|") !== false);
$repeat_check = [];
$phrase = "";
$delim = " ";
@@ -432,28 +433,40 @@ class PhraseModel extends ParallelModel
$i < min($bound[0] + $bound[1], $results_high);
$i++) {
if (isset($out_results['PAGES'][$out_count])) {
- if (!strstr($phrase, "|") &&
+ if (empty($original_has_disjuncts) &&
isset($out_results['PAGES'][$out_count]
[self::QUESTION_ANSWERS])) {
$triplets_with_answer =
$out_results['PAGES'][$out_count]
[self::QUESTION_ANSWERS];
- $question = trim(
- PhraseParser::stemCharGramSegment($phrase,
- L\guessLocaleFromString($phrase), true));
- if (isset($triplets_with_answer[$question])) {
- $out_results['PAGES'][$out_count]['ANSWER']=
- $triplets_with_answer[$question];
- $answer = $triplets_with_answer[$question];
- if (array_key_exists(
- $answer, $answer_score_map)) {
- $new_score = $answer_score_map[$answer]
- + $out_results['PAGES'][$out_count]
- ['OUT_SCORE'];
- } else {
- $answer_score_map[$answer] =
- $out_results['PAGES'][$out_count]
- ['OUT_SCORE'];
+ $question_phrases =
+ explode("|",
+ preg_replace("/\S+\:\S+/u", "", $phrase));
+ foreach ($question_phrases as $question_phrase){
+ $question = trim(
+ PhraseParser::stemCharGramSegment(
+ $question_phrase,
+ L\guessLocaleFromString(
+ $question_phrase), true));
+ if (isset(
+ $triplets_with_answer[$question])) {
+ $out_results['PAGES'][$out_count][
+ 'ANSWER']=
+ $triplets_with_answer[$question];
+ $answer =
+ $triplets_with_answer[$question];
+ if (array_key_exists(
+ $answer, $answer_score_map)) {
+ $new_score =
+ $answer_score_map[$answer]
+ + $out_results['PAGES'][
+ $out_count][self::SCORE];
+ } else {
+ $answer_score_map[$answer] =
+ $out_results['PAGES'][$out_count]
+ [self::SCORE];
+ }
+ break;
}
}
}
@@ -934,9 +947,9 @@ class PhraseModel extends ParallelModel
$generated_question = $tokenizer->questionParser(
$phrase, $tag);
if (!empty($generated_question['CONCISE'])) {
- $phrase = $generated_question['CONCISE'][0];
+ $phrase = implode("|", $generated_question['CONCISE']);
} else if (!empty($generated_question['RAW'])) {
- $phrase = $generated_question['RAW'][0];
+ $phrase = implode("|", $generated_question['RAW']);
}
}
if ($len > 0 && !preg_match("/site\:\S{5}|info\:|path\:|ip\:/",
diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php
index 296f9e4d3..ffa6d4f91 100644
--- a/src/views/elements/SearchElement.php
+++ b/src/views/elements/SearchElement.php
@@ -144,7 +144,8 @@ class SearchElement extends Element implements CrawlConstants
}
if (!empty($data['BEST_ANSWER'])) {
?><div id="best-answer" class="echo-link">
- <?= $data['BEST_ANSWER']; ?>
+ <?= tl('search_element_possible_answer') . " ".
+ $data['BEST_ANSWER']; ?>
</div><?php
}
if (empty($data['PAGES']) && empty($data['TREND_DATA']) &&