diff --git a/src/controllers/CrawlController.php b/src/controllers/CrawlController.php
index 8c7648989..91d5e735c 100644
--- a/src/controllers/CrawlController.php
+++ b/src/controllers/CrawlController.php
@@ -237,7 +237,7 @@ class CrawlController extends Controller implements CrawlConstants
$num = $this->clean($_REQUEST["num"], "int");
$i = $this->clean($_REQUEST["i"], "int");
$crawl_model->current_machine = $i;
- list($lookups, $exclude_fields) =
+ list($lookups, $exclude_fields, $format_words, $description_length) =
unserialize(L\webdecode($_REQUEST["arg"]));
$our_lookups = [];
foreach ($lookups as $lookup => $lookup_info) {
@@ -260,7 +260,7 @@ class CrawlController extends Controller implements CrawlConstants
}
}
$items = $crawl_model->getCrawlItems($our_lookups, null,
- $exclude_fields);
+ $exclude_fields, $format_words, $description_length);
$this->web_site->header("Content-Type: application/octet-stream");
$items["ELAPSED_TIME"] = L\changeInMicrotime($start_time);
$items = gzdeflate(serialize($items));
diff --git a/src/models/Model.php b/src/models/Model.php
index a2f00bfaf..f873655ce 100755
--- a/src/models/Model.php
+++ b/src/models/Model.php
@@ -49,7 +49,6 @@ require_once __DIR__."/../library/Utility.php";
*/
class Model implements CrawlConstants
{
- const SCORE_PRECISION = 4;
const SNIPPET_TITLE_LENGTH = 20;
const MAX_SNIPPET_TITLE_LENGTH = 20;
const SNIPPET_LENGTH_LEFT = 20;
@@ -169,9 +168,8 @@ class Model implements CrawlConstants
return file_put_contents($filename, $data);
}
/**
- * Given an array page summaries, for each summary extracts snippets which
- * are related to a set of search words. For each snippet, bold faces the
- * search terms, and then creates a new summary array.
+ * Given an array of page summaries, for each summary checks if its url
+ * corresponds to a human edited search result; if so, replaces and formats it.
*
* @param array $results web pages summaries (these in turn are
* arrays!)
@@ -179,7 +177,7 @@ class Model implements CrawlConstants
* @param int $description_length length of the description
* @return array summaries which have been snippified and bold faced
*/
- public function formatPageResults($results, $words = null,
+ public function addEditedPageResults($results, $words = null,
$description_length = self::DEFAULT_DESCRIPTION_LENGTH)
{
if (isset($results['PAGES'])) {
@@ -214,60 +212,76 @@ class Model implements CrawlConstants
$page[$field] = $summary[$field];
}
}
+ $page = $this->formatSinglePageResult($page, $words,
+ $description_length);
+ $pages[$i] = $page;
}
}
- if (empty($page[self::TITLE])) {
- $page[self::TITLE] = "";
- }
- $page[self::TITLE] = strip_tags($page[self::TITLE]);
- $page[self::DESCRIPTION] = strip_tags(
- preg_replace("/\<\s+([a-zA-Z])/", '<$1',
- $page[self::DESCRIPTION]));
- if (strlen($page[self::TITLE]) == 0) {
- $offset = min(mb_strlen($page[self::DESCRIPTION]),
- self::SNIPPET_TITLE_LENGTH);
- $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset);
- $ellipsis = "";
- if ($end_title > self::SNIPPET_TITLE_LENGTH) {
- $ellipsis = "...";
- if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) {
- $end_title = self::MAX_SNIPPET_TITLE_LENGTH;
- }
- }
- $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0,
- $end_title) . $ellipsis;
- //still no text revert to url
- if (strlen($page[self::TITLE]) == 0 &&
- isset($page[self::URL])) {
- $page[self::TITLE] = $page[self::URL];
+ }
+ $output['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
+ $output['PAGES'] = ($deleted_a_page) ? $pages : array_values($pages);
+ return $output;
+ }
+ /**
+ * Given a single page summary, strips tags from its title and description,
+ * falls back to a description prefix or the url for an empty title, and
+ * snippifies and bold faces the search words (or truncates if none given).
+ *
+ * @param array $page a single search result summary
+ * @param array $words keywords (typically what was searched on)
+ * @param int $description_length length of the description
+ * @return array $page which has been snippified and bold faced
+ */
+ public function formatSinglePageResult($page, $words = null,
+ $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
+ {
+ if (empty($page[self::TITLE])) {
+ $page[self::TITLE] = "";
+ }
+ $page[self::TITLE] = strip_tags($page[self::TITLE]);
+ $page[self::DESCRIPTION] = strip_tags(
+ preg_replace("/\<\s+([a-zA-Z])/", '<$1',
+ $page[self::DESCRIPTION]));
+ if (strlen($page[self::TITLE]) == 0) {
+ $offset = min(mb_strlen($page[self::DESCRIPTION]),
+ self::SNIPPET_TITLE_LENGTH);
+ $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset);
+ $ellipsis = "";
+ if ($end_title > self::SNIPPET_TITLE_LENGTH) {
+ $ellipsis = "...";
+ if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) {
+ $end_title = self::MAX_SNIPPET_TITLE_LENGTH;
+ }
+ }
- // do a little cleaning on text
- if ($words != null) {
- $page[self::TITLE] =
- $this->boldKeywords($page[self::TITLE], $words);
- if (!isset($page[self::IS_FEED])) {
- $page[self::DESCRIPTION] =
- $this->getSnippets($page[self::DESCRIPTION],
- $words, $description_length);
- }
+ $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0,
+ $end_title) . $ellipsis;
+ // title is still empty, so fall back to the url
+ if (strlen($page[self::TITLE]) == 0 &&
+ isset($page[self::URL])) {
+ $page[self::TITLE] = $page[self::URL];
+ }
+ }
+ // bold the search words and snippify; with no words just truncate
+ if ($words != null) {
+ $page[self::TITLE] =
+ $this->boldKeywords($page[self::TITLE], $words);
+ if (!isset($page[self::IS_FEED])) {
$page[self::DESCRIPTION] =
- $this->boldKeywords($page[self::DESCRIPTION], $words);
- } else {
- $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION],
- 0, $description_length);
+ $this->getSnippets($page[self::DESCRIPTION],
+ $words, $description_length);
}
- $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "",
- $page[self::DESCRIPTION]);
- $page[self::DESCRIPTION] = (substr($pre_description,0,2) == "b>") ?
- "<" . $pre_description : $pre_description;
- $page[self::SCORE] = mb_substr($page[self::SCORE], 0,
- self::SCORE_PRECISION);
- $pages[$i] = $page;
+ $page[self::DESCRIPTION] =
+ $this->boldKeywords($page[self::DESCRIPTION], $words);
+ } else {
+ $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION],
+ 0, $description_length);
}
- $output['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
- $output['PAGES'] = ($deleted_a_page) ? $pages : array_values($pages);
- return $output;
+ $page[self::TITLE] = trim($page[self::TITLE], " .");
+ $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "",
+ $page[self::DESCRIPTION]);
+ $page[self::DESCRIPTION] = (substr($pre_description, 0, 2) == "b>") ?
+ "<" . $pre_description : $pre_description;
+ return $page;
}
/**
* Given a string, extracts a snippets of text related to a given set of
@@ -279,16 +293,30 @@ class Model implements CrawlConstants
* @param string $text haystack to extract snippet from
* @param array $words keywords used to make look in haystack
* @param string $description_length length of the description desired
- * @param bool $words_change getSnippets might be called many times on
- * the same search page with the same $words, if true then the
- * preprocessing of $words is avoided and cached versions are used
* @return string a concatenation of the extracted snippets of each word
*/
- public function getSnippets($text, $words, $description_length,
- $words_change = false)
+ public function getSnippets($text, $words, $description_length)
{
static $search_words = [];
+ static $last_words = "";
static $word_regex = "";
+ if (mb_strlen($text) < $description_length) {
+ return $text;
+ }
+ if (empty($words)) {
+ $snippet_string = mb_substr($text, 0, $description_length);
+ $rpos = strrpos($snippet_string, " ");
+ if ($rpos) {
+ $snippet_string = mb_substr($snippet_string, 0, $rpos);
+ }
+ return $snippet_string;
+ }
+ $word_string = implode(" ", $words);
+ $words_change = false;
+ if ($word_string != $last_words) {
+ $words_change = true;
+ $last_words = $word_string;
+ }
$start_regex = "/";
$left = self::SNIPPET_LENGTH_LEFT;
$left3 = $left - 3;
@@ -297,16 +325,11 @@ class Model implements CrawlConstants
$start_regex2 = "/\b(\w{3}.{0,$left3})?(?:(?:";
$end_regex = "/ui";
$end_regex2 = ").{0,$right}\b)+/ui";
- if (mb_strlen($text) < $description_length) {
- return $text;
- }
$ellipsis = "";
if ($words_change || empty($search_words)) {
- $search_words = [];
- foreach ($words as $word) {
- $search_words = array_merge($search_words, explode(" ", $word));
- }
- $search_words = array_filter(array_unique($search_words));
+ // original list of words might have had space separated phrases
+ $search_words = array_filter(array_unique(
+ explode(" ", $word_string)));
$word_regex = "";
$delim = "";
foreach ($search_words as $word) {
@@ -322,41 +345,40 @@ class Model implements CrawlConstants
$len = mb_strlen($text_source);
$offset = 0;
if ($len < self::MIN_SNIPPET_LENGTH) {
- if (preg_match($start_regex . $word_regex.
+ if (preg_match($start_regex . $word_regex .
$end_regex, $text_source, $match)) {
if (stristr($snippet_string, $text_source) === false) {
- $snippet_string .= $ellipsis. $text_source;
+ $snippet_string .= $ellipsis . $text_source;
$ellipsis = " ... ";
- if (mb_strlen($snippet_string) >= $description_length) {
- break;
- }
}
}
- continue;
- }
- $word_locations = [];
- preg_match_all($start_regex2 . $word_regex . $end_regex2,
- $text_source, $matches);
- if (isset($matches[0])) {
- $seen_match = [];
- foreach ($matches[0] as $match) {
- if ($match >= $description_length) {
- $match = mb_substr($match, 0, $description_length);
- $rpos = strrpos($match, " ");
- if ($rpos) {
- $match = mb_substr($match, 0, $rpos);
- }
- }
- $match = trim($match, ".");
- if (stristr($snippet_string, $match) === false) {
- $snippet_string .= $ellipsis. $match;
- $ellipsis = " ... ";
- if (mb_strlen($snippet_string) >= $description_length) {
- break;
+ } else {
+ preg_match_all($start_regex2 . $word_regex . $end_regex2,
+ $text_source, $matches);
+ if (isset($matches[0])) {
+ $seen_match = [];
+ foreach ($matches[0] as $match) {
+ $match = trim($match, ".");
+ if (stristr($snippet_string, $match) === false) {
+ $snippet_string .= $ellipsis. $match;
+ $ellipsis = " ... ";
+ if (mb_strlen($snippet_string) >=
+ $description_length) {
+ break;
+ }
}
}
}
}
+ if (mb_strlen($snippet_string) >= $description_length) {
+ $snippet_string = mb_substr($snippet_string, 0,
+ $description_length);
+ $rpos = strrpos($snippet_string, " ");
+ if ($rpos) {
+ $snippet_string = mb_substr($snippet_string, 0, $rpos);
+ }
+ break;
+ }
}
return $snippet_string;
}
@@ -383,7 +405,7 @@ class Model implements CrawlConstants
/**
* Gets a list of all DBMS that work with the search engine
*
- * @return array Names of availabledatasources
+ * @return array Names of available data sources
*/
public function getDbmsList()
{
diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php
index 1b97675d4..51269b15d 100755
--- a/src/models/ParallelModel.php
+++ b/src/models/ParallelModel.php
@@ -110,17 +110,22 @@ class ParallelModel extends Model
* the crawlItem but which should be excluded from the result.
* This will make the result smaller and so hopefully faster to
* transmit
+ * @param array $format_words words which should be highlighted in
+ * search snippets returned
+ * @param int $description_length length of snippets to be returned
+ * for each search result
* @return array of summary data for the matching documents
*/
public function getCrawlItems($lookups, $machine_urls = null,
- $exclude_fields = [])
+ $exclude_fields = [], $format_words = null,
+ $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
{
if (!empty($machine_urls) && !$this->isSingleLocalhost($machine_urls)) {
$summaries = $this->networkGetCrawlItems($lookups, $machine_urls,
- $exclude_fields);
+ $exclude_fields, $format_words, $description_length);
} else {
$summaries = $this->nonNetworkGetCrawlItems($lookups,
- $exclude_fields);
+ $exclude_fields, $format_words, $description_length);
}
return $summaries;
}
@@ -129,7 +134,7 @@ class ParallelModel extends Model
* by their url, or by group of 5-tuples of the form
* (machine, key, index, generation, offset). This makes an execMachines
* call to make a network request to the CrawlController's on each machine
- * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
+ * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
* on each machine. The results are then sent back to networkGetCrawlItems
* and aggregated.
*
@@ -139,10 +144,15 @@ class ParallelModel extends Model
* the crawlItem but which should be excluded from the result.
* This will make the result smaller and so hopefully faster to
* transmit
+ * @param array $format_words words which should be highlighted in
+ * search snippets returned
+ * @param int $description_length length of snippets to be returned
+ * for each search result
* @return array of summary data for the matching documents
*/
public function networkGetCrawlItems($lookups, $machine_urls,
- $exclude_fields = [])
+ $exclude_fields = [], $format_words = null, $description_length =
+ self::DEFAULT_DESCRIPTION_LENGTH)
{
//Set-up network request
$machines = [];
@@ -169,7 +179,8 @@ class ParallelModel extends Model
}
//Make request
$page_set = $this->execMachines("getCrawlItems",
- $machines, serialize([$lookups, $exclude_fields]), $num_machines);
+ $machines, serialize([$lookups, $exclude_fields,
+ $format_words, $description_length]), $num_machines);
//Aggregate results
$summaries = [];
$elapsed_times = [];
@@ -239,9 +250,15 @@ class ParallelModel extends Model
* the crawlItem but which should be excluded from the result.
* This will make the result smaller and so hopefully faster to
* transmit
+ * @param array $format_words words which should be highlighted in
+ * search snippets returned
+ * @param int $description_length length of snippets to be returned
+ * for each search result
* @return array of summary data for the matching documents
*/
- public function nonNetworkGetCrawlItems($lookups, $exclude_fields = [])
+ public function nonNetworkGetCrawlItems($lookups, $exclude_fields = [],
+ $format_words = null, $description_length =
+ self::DEFAULT_DESCRIPTION_LENGTH)
{
$summary_offset = null;
$generation = null;
@@ -377,6 +394,13 @@ class ParallelModel extends Model
unset($summaries[$key][$exclude_field]);
}
}
+ if ($format_words !== null && count($summaries) > 0 &&
+ $description_length > 0) {
+ foreach ($summaries as $key => $summary) {
+ $summaries[$key] = $this->formatSinglePageResult($summary,
+ $format_words, $description_length);
+ }
+ }
return $summaries;
}
/**
@@ -409,7 +433,7 @@ class ParallelModel extends Model
}
$num_generations = $index_archive->generation_info['ACTIVE'];
$hash_key = ($is_key) ? L\crawlHashWord($url_or_key, true) :
- L\crawlHashWord("info:".$url_or_key, true);
+ L\crawlHashWord("info:" . $url_or_key, true);
$info = IndexManager::getWordInfo($index_name, $hash_key, 0, 1);
if (!isset($info[0][4])) {
return false;
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 2fa807312..f9f5158b6 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -377,6 +377,13 @@ class PhraseModel extends ParallelModel
}
}
}
+ if ($format) {
+ if (count($format_words) == 0) {
+ $format_words = null;
+ }
+ } else {
+ $format_words = null;
+ }
if (C\QUERY_STATISTICS) {
$this->query_info['QUERY'] .=
"$in2<b>Presentation Parse time</b>: " .
@@ -390,7 +397,7 @@ class PhraseModel extends ParallelModel
$out_results = $this->getSummariesByHash($word_structs,
$low, $phrase_num, $filter, $use_cache_if_allowed, $raw,
$queue_servers, $phrase, $save_timestamp_name,
- $limit_feeds);
+ $limit_feeds, $format_words);
if (isset($out_results['PAGES']) &&
count($out_results['PAGES']) != 0) {
$out_count = 0;
@@ -465,38 +472,15 @@ class PhraseModel extends ParallelModel
} elseif (isset($results['PAGES'])) {
$results['TOTAL_ROWS'] = count($results['PAGES']);
}
- if ($format) {
- if (count($format_words) == 0) {
- $format_words = null;
- }
- } else {
- $format_words = null;
- }
- $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
- /* additional meta word come from indexing plugins which might need
- longer description lengths, say for recipes
- */
- if (isset($this->additional_meta_words) &&
- is_array($this->additional_meta_words)) {
- foreach ($this->additional_meta_words as $meta_word => $length) {
- $pattern = "/$meta_word/";
- if (preg_match($pattern, $input_phrase)) {
- $description_length = $length;
- break; // only match the first found
- }
- }
- }
if ($raw == 0 && isset($results['TOTAL_ROWS']) &&
$results['TOTAL_ROWS'] > 0) {
- $output = $this->formatPageResults($results, $format_words,
+ $results = $this->addEditedPageResults($results, $format_words,
$description_length);
if (!empty($answer_score_map)) {
arsort($answer_score_map);
reset($answer_score_map);
- $output['BEST_ANSWER'] = key($answer_score_map);
+ $results['BEST_ANSWER'] = key($answer_score_map);
}
- } else {
- $output = $results;
}
if (C\QUERY_STATISTICS) {
$this->query_info['QUERY'] .= "<b>Format Time</b>: ".
@@ -506,7 +490,7 @@ class PhraseModel extends ParallelModel
$this->db->total_time += $this->query_info['ELAPSED_TIME'];
$this->db->query_log[] = $this->query_info;
}
- return $output;
+ return $results;
}
/**
* Parses from a string phrase representing a conjunctive query, a struct
@@ -1045,13 +1029,14 @@ class PhraseModel extends ParallelModel
* docs after $save_timestamp 's previous iterate position.
* @param bool $limit_feeds if true the number of feed shard items to
* allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT
- *
+ * @param array $format_words words which should be highlighted in
+ * search snippets returned
* @return array document summaries
*/
public function getSummariesByHash($word_structs, $limit, $num, &$filter,
$use_cache_if_allowed = true, $raw = 0, $queue_servers = [],
$original_query = "", $save_timestamp_name = "",
- $limit_feeds = true)
+ $limit_feeds = true, $format_words = null)
{
$indent= " ";
$in2 = $indent . $indent;
@@ -1314,10 +1299,26 @@ class PhraseModel extends ParallelModel
$out_pages = [];
$cur_limit = $start_slice;
$with_qa = (preg_match("/\bqqq\b/i", $original_query)) ? true : false;
+ // now calculate snippet length
+ $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
+ /* additional meta words come from indexing plugins which might need
+ longer description lengths, say for recipes
+ */
+ if (isset($this->additional_meta_words) &&
+ is_array($this->additional_meta_words)) {
+ foreach ($this->additional_meta_words as $meta_word => $length) {
+ $pattern = "/$meta_word/";
+ if (preg_match($pattern, $original_query)) {
+ $description_length = $length;
+ break; // only match the first found
+ }
+ }
+ }
while (count($out_pages) < $to_get_count && $get_pages) {
$out_pages = array_merge($out_pages,
$this->getSummariesFromOffsets($get_pages, $queue_servers,
- $raw, $groups_with_docs, $with_qa));
+ $raw, $groups_with_docs, $with_qa, $format_words,
+ $description_length));
if ($save_timestamp_name != "") {
break;
}
@@ -1383,11 +1384,24 @@ class PhraseModel extends ParallelModel
* contain at least one doc as opposed to a groups with only links
* @param bool $with_question_answer_info whether question answer info
* in summaries needs to be returned
+ * @param array $format_words words which should be highlighted in
+ * search snippets returned
+ * @param int $description_length length of snippets to be returned
+ * for each search result
* @return array pages with summaries added
*/
- public function getSummariesFromOffsets(&$pages, &$queue_servers, $raw,
- $groups_with_docs, $with_question_answer_info)
+ public function getSummariesFromOffsets(&$pages, &$queue_servers,
+ $raw, $groups_with_docs, $with_question_answer_info,
+ $format_words = null, $description_length =
+ self::DEFAULT_DESCRIPTION_LENGTH)
{
+ if ($raw != 0) {
+ $format_words = null;
+ } else {
+ if ($format_words == null) {
+ $format_words = [];
+ }
+ }
$lookups = [];
$summary_exclude_fields = [self::HEADER, self::PAGE, self::LINKS,
self::DESCRIPTION_SCORES];
@@ -1429,7 +1443,7 @@ class PhraseModel extends ParallelModel
/* look up items (items we have a link summary for, but not doc
summary)*/
$summaries = $this->getCrawlItems($lookups, $lookup_queue_servers,
- $summary_exclude_fields);
+ $summary_exclude_fields, $format_words, $description_length);
$lookups = [];
// link summaries we want to remember in case don't have doc summary
$link_summaries = [];
@@ -1459,7 +1473,7 @@ class PhraseModel extends ParallelModel
}
// lookup redirects
$loc_summaries = $this->getCrawlItems($lookups, $lookup_queue_servers,
- $summary_exclude_fields);
+ $summary_exclude_fields, $format_words, $description_length);
// delete summaries we found from $link_summaries
if (is_array($loc_summaries)) {
$loc_hashes = array_keys($loc_summaries);
diff --git a/src/views/SearchView.php b/src/views/SearchView.php
index 308df2155..d5ab36eee 100755
--- a/src/views/SearchView.php
+++ b/src/views/SearchView.php
@@ -52,6 +52,10 @@ class SearchView extends View implements CrawlConstants
* Represent extension of Git urls
*/
const GIT_EXTENSION = ".git";
+ /**
+ * Number of decimals for search result scores
+ */
+ const SCORE_PRECISION = 4;
/**
* Draws the main landing pages as well as search result pages
*
@@ -440,8 +444,8 @@ class SearchView extends View implements CrawlConstants
e($label . ":" . number_format($score, 2) . "\n");
}
}
- ?>" ><?=tl('search_view_score', $page[self::SCORE]) ?></span>
- <?php
+ ?>" ><?=tl('search_view_score',number_format($page[self::SCORE],
+ self::SCORE_PRECISION))?></span><?php
}
?>
</p>