diff --git a/controllers/search_controller.php b/controllers/search_controller.php index f5c57c676..6ba614c96 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -1016,10 +1016,11 @@ class SearchController extends Controller implements CrawlConstants $first_child = $body->firstChild; // add information about what was extracted from page + $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right"; $summaryNode = $dom->createElement('pre'); $summaryNode = $body->insertBefore($summaryNode, $first_child); $summaryNode->setAttributeNS("","style", "border-color: black; ". - "border-style:solid; border-width:3px; ". + "border-style:solid; border-width:3px; text-align:$text_align;". "padding: 5px; background-color: white; display:none;"); $summaryNode->setAttributeNS("","id", "summary-page-id"); @@ -1035,7 +1036,6 @@ class SearchController extends Controller implements CrawlConstants $textNode = $dom->createTextNode("var summaryShow = 'none';"); $scriptNode->appendChild($textNode); - $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right"; $aDivNode = $dom->createElement('div'); $aDivNode = $body->insertBefore($aDivNode, $summaryNode); $aDivNode->setAttributeNS("","style", "border-color: black; ". diff --git a/models/model.php b/models/model.php index 4d38ec460..0fc3383c2 100755 --- a/models/model.php +++ b/models/model.php @@ -224,12 +224,17 @@ class Model implements CrawlConstants */ function getSnippets($text, $words, $description_length) { + if(strlen($text) < $description_length) { + return $text; + } + $ellipsis = ""; $len = mb_strlen($text); $offset = 0; $words = array_unique($words); $words = array_filter($words); $snippet_string = ""; + $snippet_hash = array(); $i = 0; do { @@ -261,23 +266,18 @@ class Model implements CrawlConstants if($high > $pre_high + 10){ $high = $pre_high; } - $snippet_string .= $ellipsis. - mb_substr($text, $low, $high - $low); - $ellipsis = " ... 
"; + $cur_snippet = trim(mb_substr($text, $low, $high - $low)); + if(!isset($snippet_hash[$cur_snippet])) { + $snippet_string .= $ellipsis. $cur_snippet; + $ellipsis = " ... "; + $snippet_hash[$cur_snippet] = true; + } if(strlen($snippet_string) >= $description_length) break 2; } $words = array_values($word_locations); $offset = $new_offset + 1; } while($offset < $len); - - if(strlen($snippet_string) < MIN_SNIPPET_LENGTH) { - $snippet_string = substr($text, 0, $description_length); - if($high = mb_strripos($snippet_string, " ")) { - $snippet_string = substr($text, 0, $high); - } - } - return $snippet_string; } diff --git a/models/parallel_model.php b/models/parallel_model.php index ecb39d61f..5891434e4 100755 --- a/models/parallel_model.php +++ b/models/parallel_model.php @@ -124,60 +124,108 @@ class ParallelModel extends Model implements CrawlConstants */ function getCrawlItems($lookups, $machine_urls = NULL) { - $summaries = array(); if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { - $num_machines = count($machine_urls); - $machines = array(); - foreach($lookups as $lookup => $lookup_info) { - if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') { - list($url, $index_name) = $lookup_info; - $index = calculatePartition($url, $num_machines, - "UrlParser::getHost"); - $machines[$index] = $machine_urls[$index]; - } else { - foreach($lookup_info as $lookup_item) { - if(count($lookup_item) == 5) { - list($index, , , , ) = $lookup_item; - $machines[$index] = $machine_urls[$index]; - } else { - $machines = $machine_urls; - break; - } + $summaries = $this->networkGetCrawlItems($lookups, $machine_urls); + } else { + $summaries = $this->nonNetworkGetCrawlItems($lookups); + } + return $summaries; + } + + /** + * In a multiple queue server setting, gets summaries for a set of document + * by their url, or by group of 5-tuples of the form + * (machine, key, index, generation, offset). 
This makes an execMachines + * call to make a network request to the CrawlControllers on each machine + * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems) + * on each machine. The results are then sent back to networkGetCrawlItems + * and aggregated. + * + * @param array $lookups things whose summaries we are trying to look up + * @param array $machine_urls an array of urls of yioop queue servers + * @return array of summary data for the matching documents + */ + function networkGetCrawlItems($lookups, $machine_urls) + { + //Set-up network request + $num_machines = count($machine_urls); + $machines = array(); + foreach($lookups as $lookup => $lookup_info) { + if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') { + list($url, $index_name) = $lookup_info; + $index = calculatePartition($url, $num_machines, + "UrlParser::getHost"); + $machines[$index] = $machine_urls[$index]; + } else { + foreach($lookup_info as $lookup_item) { + if(count($lookup_item) == 5) { + list($index, , , , ) = $lookup_item; + $machines[$index] = $machine_urls[$index]; + } else { + $machines = $machine_urls; + break; } } - } - $page_set = $this->execMachines("getCrawlItems", - $machines, serialize($lookups), $num_machines); + + } + //Make request + $page_set = $this->execMachines("getCrawlItems", + $machines, serialize($lookups), $num_machines); - if(is_array($page_set)) { - foreach($page_set as $elt) { - $result = unserialize(webdecode($elt[self::PAGE])); - if(!is_array($result)) continue; - foreach($result as $lookup => $summary) { - if(isset($summaries[$lookup])) { - if(isset($summary[self::DESCRIPTION])) { - if(!isset($summaries[$lookup][ - self::DESCRIPTION])){ - $summaries[$lookup][self::DESCRIPTION] = ""; - } + //Aggregate results + $summaries = array(); + if(is_array($page_set)) { + foreach($page_set as $elt) { + $description_hash = array(); + $result = unserialize(webdecode($elt[self::PAGE])); + if(!is_array($result)) continue; + foreach($result as $lookup 
=> $summary) { + if(isset($summaries[$lookup])) { + if(isset($summary[self::DESCRIPTION])) { + $description = trim($summary[self::DESCRIPTION]); + if(!isset($summaries[$lookup][self::DESCRIPTION])){ + $summaries[$lookup][self::DESCRIPTION] = ""; + } + if(!isset($description_hash[$description])){ $summaries[$lookup][self::DESCRIPTION] = " .. ". - $summary[self::DESCRIPTION]; + $description; + $description_hash[$description] = true; } - foreach($summary as $attr => $value){ - if($attr !=self::DESCRIPTION && - !isset($summaries[$lookup][$attr])) { - $summaries[$lookup][$attr] = $value; - } + } + foreach($summary as $attr => $value){ + if($attr !=self::DESCRIPTION && + !isset($summaries[$lookup][$attr])) { + $summaries[$lookup][$attr] = $value; } - } else { - $summaries[$lookup] = $summary; } + } else { + $summaries[$lookup] = $summary; } } } - return $summaries; } + return $summaries; + } + + /** + * Gets summaries on a particular machine for a set of documents by + * their url, or by group of 5-tuples of the form + * (machine, key, index, generation, offset) + * This may be used in either the single queue_server setting or + * it may be called indirectly by a particular machine's + * CrawlController as part of fulfilling a network-based getCrawlItems + * request. $lookups contains items which are to be grouped (as came + * from same url or site with the same cache). So this function aggregates + * their descriptions. 
+ * + * @param string $lookups things whose summaries we are trying to look up + * @param array $machine_urls an array of urls of yioop queue servers + * @return array of summary data for the matching documents + */ + function nonNetworkGetCrawlItems($lookups) + { + $summaries = array(); foreach($lookups as $lookup => $lookup_info) { if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') { list($url, $index_name) = $lookup_info; @@ -188,6 +236,7 @@ class ParallelModel extends Model implements CrawlConstants $index_archive->getPage($summary_offset, $generation); } else { $summary = array(); + $description_hash = array(); foreach($lookup_info as $lookup_item) { if(count($lookup_item) == 2) { list($word_key, $index_name) = $lookup_item; @@ -207,27 +256,31 @@ class ParallelModel extends Model implements CrawlConstants $index->setCurrentShard($generation, true); $page = @$index->getPage($summary_offset); if(!$page || $page == array()) {continue;} - $ellipsis_used = false; $copy = false; if($summary == array()) { + if(isset($page[self::DESCRIPTION])) { + $description = trim($page[self::DESCRIPTION]); + $page[self::DESCRIPTION] = $description; + $description_hash[$description] = true; + } $summary = $page; } else if (isset($page[self::DESCRIPTION])) { + $description = trim($page[self::DESCRIPTION]); if(!isset($summary[self::DESCRIPTION])) { $summary[ self::DESCRIPTION] = ""; } - $summary[self::DESCRIPTION].= - " .. ".$page[self::DESCRIPTION]; - $ellipsis_used = true; + if(!isset($description_hash[$description])){ + $summary[self::DESCRIPTION] .= + " .. ".$description; + $description_hash[$description] = true; + } $copy = true; } else { $copy = true; } - if($ellipsis_used && strlen($summary[self::DESCRIPTION]) > + if(strlen($summary[self::DESCRIPTION]) > self::MIN_DESCRIPTION_LENGTH) { - /* want at least one ellipsis in case terms only - appear in links - */ break; } if($copy) {