Chris Pollett [2012-06-23]
Stab at reducing the number of duplicate snippets in serps, a=chris
Filename
controllers/search_controller.php
models/model.php
models/parallel_model.php
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index f5c57c676..6ba614c96 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -1016,10 +1016,11 @@ class SearchController extends Controller implements CrawlConstants
         $first_child = $body->firstChild;

         // add information about what was extracted from page
+        $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right";
         $summaryNode = $dom->createElement('pre');
         $summaryNode = $body->insertBefore($summaryNode, $first_child);
         $summaryNode->setAttributeNS("","style", "border-color: black; ".
-            "border-style:solid; border-width:3px; ".
+            "border-style:solid; border-width:3px; text-align:$text_align;".
             "padding: 5px; background-color: white; display:none;");
         $summaryNode->setAttributeNS("","id", "summary-page-id");

@@ -1035,7 +1036,6 @@ class SearchController extends Controller implements CrawlConstants
         $textNode = $dom->createTextNode("var summaryShow = 'none';");
         $scriptNode->appendChild($textNode);

-        $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right";
         $aDivNode = $dom->createElement('div');
         $aDivNode = $body->insertBefore($aDivNode, $summaryNode);
         $aDivNode->setAttributeNS("","style", "border-color: black; ".
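The first hunk simply hoists the $text_align computation so the value already
exists when the summary node is styled; previously it was only computed
further down, for the toggle div. A minimal runnable sketch of the idea, with
a stub standing in for Yioop's getLocaleDirection():

<?php
// Stub for illustration only; in Yioop, getLocaleDirection() reports the
// current locale's text direction ('ltr' or 'rtl').
function getLocaleDirection() { return 'rtl'; }

$text_align = (getLocaleDirection() == 'ltr') ? "left" : "right";

$dom = new DOMDocument();
$summaryNode = $dom->createElement('pre');
// Computing $text_align once, before the first styled node is created,
// lets both the summary <pre> and the later toggle <div> share it.
$summaryNode->setAttribute("style",
    "border: 3px solid black; text-align:$text_align; display:none;");
echo $dom->saveXML($summaryNode), "\n";
// prints: <pre style="border: 3px solid black; text-align:right; display:none;"/>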
diff --git a/models/model.php b/models/model.php
index 4d38ec460..0fc3383c2 100755
--- a/models/model.php
+++ b/models/model.php
@@ -224,12 +224,17 @@ class Model implements CrawlConstants
      */
     function getSnippets($text, $words, $description_length)
     {
+        if(mb_strlen($text) < $description_length) {
+            return $text;
+        }
+
         $ellipsis = "";
         $len = mb_strlen($text);
         $offset = 0;
         $words = array_unique($words);
         $words = array_filter($words);
         $snippet_string = "";
+        $snippet_hash = array();
         $i = 0;
         do
         {
@@ -261,23 +266,18 @@ class Model implements CrawlConstants
                 if($high > $pre_high + 10){
                     $high = $pre_high;
                 }
-                $snippet_string .= $ellipsis.
-                     mb_substr($text, $low, $high - $low);
-                $ellipsis = " ... ";
+                $cur_snippet = trim(mb_substr($text, $low, $high - $low));
+                if(!isset($snippet_hash[$cur_snippet])) {
+                    $snippet_string .= $ellipsis. $cur_snippet;
+                    $ellipsis = " ... ";
+                    $snippet_hash[$cur_snippet] = true;
+                }
                 if(strlen($snippet_string) >= $description_length) break 2;
             }
             $words = array_values($word_locations);
             $offset = $new_offset + 1;
         } while($offset < $len);

-
-        if(strlen($snippet_string) < MIN_SNIPPET_LENGTH) {
-            $snippet_string = substr($text, 0, $description_length);
-            if($high = mb_strripos($snippet_string, " ")) {
-                $snippet_string = substr($text, 0, $high);
-            }
-        }
-
         return $snippet_string;
     }
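Read in isolation, the getSnippets change does two things: it returns short
texts whole, and it keeps a hash of snippets already emitted so a page whose
matching text repeats (boilerplate, repeated link text) no longer yields the
same snippet twice. A self-contained sketch of the dedup step; the name
joinUniqueSnippets is hypothetical, not part of the model:

<?php
// Join candidate snippets with ellipses, skipping exact duplicates.
// Mirrors the trim + seen-hash + length-cap pattern in getSnippets above.
function joinUniqueSnippets($snippets, $max_length)
{
    $seen = array();
    $out = "";
    $ellipsis = "";
    foreach ($snippets as $snippet) {
        $snippet = trim($snippet);
        if ($snippet === "" || isset($seen[$snippet])) {
            continue; // duplicate snippet, don't repeat it in the serp
        }
        $seen[$snippet] = true;
        $out .= $ellipsis . $snippet;
        $ellipsis = " ... ";
        if (strlen($out) >= $max_length) {
            break;
        }
    }
    return $out;
}

echo joinUniqueSnippets(array("foo bar", "foo bar", "baz qux"), 400), "\n";
// prints: foo bar ... baz qux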

diff --git a/models/parallel_model.php b/models/parallel_model.php
index ecb39d61f..5891434e4 100755
--- a/models/parallel_model.php
+++ b/models/parallel_model.php
@@ -124,60 +124,108 @@ class ParallelModel extends Model implements CrawlConstants
      */
     function getCrawlItems($lookups, $machine_urls = NULL)
     {
-        $summaries = array();
         if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
-            $num_machines = count($machine_urls);
-            $machines = array();
-            foreach($lookups as $lookup => $lookup_info) {
-                if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') {
-                    list($url, $index_name) = $lookup_info;
-                    $index = calculatePartition($url, $num_machines,
-                        "UrlParser::getHost");
-                    $machines[$index] = $machine_urls[$index];
-                } else {
-                    foreach($lookup_info as $lookup_item) {
-                        if(count($lookup_item) == 5) {
-                            list($index, , , , ) = $lookup_item;
-                            $machines[$index] = $machine_urls[$index];
-                        } else {
-                            $machines = $machine_urls;
-                            break;
-                        }
+            $summaries = $this->networkGetCrawlItems($lookups, $machine_urls);
+        } else {
+            $summaries = $this->nonNetworkGetCrawlItems($lookups);
+        }
+        return $summaries;
+    }
+
+    /**
+     * In a multiple queue server setting, gets summaries for a set of
+     * documents by their url, or by group of 5-tuples of the form
+     * (machine, key, index, generation, offset). This makes an execMachines
+     * call to send a network request to the CrawlController on each machine,
+     * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
+     * locally. The results are then sent back to networkGetCrawlItems
+     * and aggregated.
+     *
+     * @param array $lookups things whose summaries we are trying to look up
+     * @param array $machine_urls an array of urls of yioop queue servers
+     * @return array of summary data for the matching documents
+     */
+    function networkGetCrawlItems($lookups, $machine_urls)
+    {
+        //Set up network request
+        $num_machines = count($machine_urls);
+        $machines = array();
+        foreach($lookups as $lookup => $lookup_info) {
+            if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') {
+                list($url, $index_name) = $lookup_info;
+                $index = calculatePartition($url, $num_machines,
+                    "UrlParser::getHost");
+                $machines[$index] = $machine_urls[$index];
+            } else {
+                foreach($lookup_info as $lookup_item) {
+                    if(count($lookup_item) == 5) {
+                        list($index, , , , ) = $lookup_item;
+                        $machines[$index] = $machine_urls[$index];
+                    } else {
+                        $machines = $machine_urls;
+                        break;
                     }
                 }
-
             }
-            $page_set = $this->execMachines("getCrawlItems",
-                $machines, serialize($lookups), $num_machines);
+
+        }
+        //Make request
+        $page_set = $this->execMachines("getCrawlItems",
+            $machines, serialize($lookups), $num_machines);

-            if(is_array($page_set)) {
-                foreach($page_set as $elt) {
-                    $result = unserialize(webdecode($elt[self::PAGE]));
-                    if(!is_array($result)) continue;
-                    foreach($result as $lookup => $summary) {
-                        if(isset($summaries[$lookup])) {
-                            if(isset($summary[self::DESCRIPTION])) {
-                                if(!isset($summaries[$lookup][
-                                    self::DESCRIPTION])){
-                                    $summaries[$lookup][self::DESCRIPTION] = "";
-                                }
+        //Aggregate results
+        $summaries = array();
+        if(is_array($page_set)) {
+            foreach($page_set as $elt) {
+                $description_hash = array();
+                $result = unserialize(webdecode($elt[self::PAGE]));
+                if(!is_array($result)) continue;
+                foreach($result as $lookup => $summary) {
+                    if(isset($summaries[$lookup])) {
+                        if(isset($summary[self::DESCRIPTION])) {
+                            $description = trim($summary[self::DESCRIPTION]);
+                            if(!isset($summaries[$lookup][self::DESCRIPTION])){
+                                $summaries[$lookup][self::DESCRIPTION] = "";
+                            }
+                            if(!isset($description_hash[$description])){
                                 $summaries[$lookup][self::DESCRIPTION] = " .. ".
-                                     $summary[self::DESCRIPTION];
+                                     $description;
+                                $description_hash[$description] = true;
                             }
-                            foreach($summary as $attr => $value){
-                                if($attr !=self::DESCRIPTION &&
-                                    !isset($summaries[$lookup][$attr])) {
-                                    $summaries[$lookup][$attr] = $value;
-                                }
+                        }
+                        foreach($summary as $attr => $value){
+                            if($attr !=self::DESCRIPTION &&
+                                !isset($summaries[$lookup][$attr])) {
+                                $summaries[$lookup][$attr] = $value;
                             }
-                        } else {
-                            $summaries[$lookup] =  $summary;
                         }
+                    } else {
+                        $summaries[$lookup] =  $summary;
                     }
                 }
             }
-            return $summaries;
         }
+        return $summaries;
+    }
+
+    /**
+     * Gets summaries on a particular machine for a set of documents by
+     * their url, or by group of 5-tuples of the form
+     * (machine, key, index, generation, offset).
+     * This may be used in either the single queue_server setting or
+     * it may be called indirectly by a particular machine's
+     * CrawlController as part of fulfilling a network-based getCrawlItems
+     * request. $lookups contains items which are to be grouped (as they
+     * came from the same url or a site with the same cache), so this
+     * function aggregates their descriptions.
+     *
+     * @param array $lookups things whose summaries we are trying to
+     *      look up
+     * @return array of summary data for the matching documents
+     */
+    function nonNetworkGetCrawlItems($lookups)
+    {
+        $summaries = array();
         foreach($lookups as $lookup => $lookup_info) {
             if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') {
                 list($url, $index_name) = $lookup_info;
@@ -188,6 +236,7 @@ class ParallelModel extends Model implements CrawlConstants
                     $index_archive->getPage($summary_offset, $generation);
             } else {
                 $summary = array();
+                $description_hash = array();
                 foreach($lookup_info as $lookup_item) {
                     if(count($lookup_item) == 2) {
                         list($word_key, $index_name) = $lookup_item;
@@ -207,27 +256,31 @@ class ParallelModel extends Model implements CrawlConstants
                     $index->setCurrentShard($generation, true);
                     $page = @$index->getPage($summary_offset);
                     if(!$page || $page == array()) {continue;}
-                    $ellipsis_used = false;
                     $copy = false;
                     if($summary == array()) {
+                        if(isset($page[self::DESCRIPTION])) {
+                            $description = trim($page[self::DESCRIPTION]);
+                            $page[self::DESCRIPTION] = $description;
+                            $description_hash[$description] = true;
+                        }
                         $summary = $page;
                     } else if (isset($page[self::DESCRIPTION])) {
+                        $description = trim($page[self::DESCRIPTION]);
                         if(!isset($summary[self::DESCRIPTION])) {
                             $summary[
                                 self::DESCRIPTION] = "";
                         }
-                        $summary[self::DESCRIPTION].=
-                            " .. ".$page[self::DESCRIPTION];
-                        $ellipsis_used = true;
+                        if(!isset($description_hash[$description])){
+                            $summary[self::DESCRIPTION] .=
+                                " .. ".$description;
+                            $description_hash[$description] = true;
+                        }
                         $copy = true;
                     } else {
                         $copy = true;
                     }
-                    if($ellipsis_used && strlen($summary[self::DESCRIPTION]) >
+                    if(strlen($summary[self::DESCRIPTION]) >
                         self::MIN_DESCRIPTION_LENGTH) {
-                        /* want at least one ellipsis in case terms only
-                           appear in links
-                         */
                         break;
                     }
                     if($copy) {
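Both networkGetCrawlItems and nonNetworkGetCrawlItems now share the same
aggregation pattern: partial summaries for one lookup key have their
descriptions joined with a " .. " separator, and a trimmed description that
was already seen is dropped. A standalone sketch under that assumption, with
a hypothetical mergeSummaryDescriptions helper and a plain 'DESCRIPTION' key
in place of the CrawlConstants constant:

<?php
// Merge partial summaries for one lookup key, appending only descriptions
// that have not been seen yet (the $description_hash trick above).
function mergeSummaryDescriptions($partial_summaries)
{
    $merged = array('DESCRIPTION' => "");
    $description_hash = array();
    foreach ($partial_summaries as $summary) {
        if (!isset($summary['DESCRIPTION'])) {
            continue;
        }
        $description = trim($summary['DESCRIPTION']);
        if ($description === "" || isset($description_hash[$description])) {
            continue; // same text came back from another machine or shard
        }
        $description_hash[$description] = true;
        $merged['DESCRIPTION'] .= " .. " . $description;
    }
    return $merged;
}

print_r(mergeSummaryDescriptions(array(
    array('DESCRIPTION' => "A page about cats"),
    array('DESCRIPTION' => "A page about cats"), // duplicate, dropped
    array('DESCRIPTION' => "More cat pictures"),
)));
// DESCRIPTION ends up as: " .. A page about cats .. More cat pictures"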