Fixes a bug in how deduplication was handled, a=chris

Chris Pollett [2010-09-17 18:Sep:th]

Fixes a bug in how deduplication was handled, a=chris

Filename
bin/fetcher.php
bin/queue_server.php
controllers/fetch_controller.php
lib/index_bundle_iterators/group_iterator.php
locale/en-US/statistics.txt

diff --git a/bin/fetcher.php b/bin/fetcher.php
index 676e310cc..ffc8a3140 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -572,22 +572,31 @@ class Fetcher implements CrawlConstants
         $deduplicated_pages = array();
         $not_downloaded = array();
         $duplicates = array();
-
+
+        /*
+            Time to deduplicate!
+            $unseen_page_hashes is checked to rule out pages seen before
+            this batch; $seen_pages to rule out repeats within this batch.
+        */
         $unseen_page_hashes =
             $this->web_archive->differencePageKeysFilter($site_pages,
             self::HASH);
+        $seen_pages = array();

         foreach($site_pages as $site) {
             if( isset($site[self::ROBOT_PATHS])) {
                 $deduplicated_pages[] = $site;
             } else if (isset($site[self::HASH]) && in_array($site[self::HASH],
-                $unseen_page_hashes)) {
+                $unseen_page_hashes) && !in_array($site[self::HASH],
+                $seen_pages)) {
                 $this->web_archive->addPageFilter(self::HASH, $site);
                 $deduplicated_pages[] = $site;
+                $seen_pages[] = $site[self::HASH];
             } else if(!isset($site[self::HASH])){
                 $not_downloaded[] = $site;
             } else {
                 $duplicates[] = $site[self::URL];
+                echo "Deduplicated:".$site[self::URL]."\n";
             }

         }
diff --git a/bin/queue_server.php b/bin/queue_server.php
index f4ca997c8..8677edf76 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -522,6 +522,7 @@ class QueueServer implements CrawlConstants
         $machine = $machine_info[self::MACHINE];
         $machine_uri = $machine_info[self::MACHINE_URI];

+        //do deduplication of summaries
         if(isset($sites[self::SEEN_URLS]) &&
             count($sites[self::SEEN_URLS]) > 0) {
             $seen_sites = $sites[self::SEEN_URLS];
@@ -851,6 +852,10 @@ class QueueServer implements CrawlConstants
         unlink($file);

         $crawl_status = array();
+        $stat_file = CRAWL_DIR."/schedules/crawl_status.txt";
+        if(file_exists($stat_file) ) {
+            $crawl_status = unserialize(file_get_contents($stat_file));
+        }
         $crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
         if(isset($sites[self::RECENT_URLS])) {
             $crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
@@ -863,9 +868,8 @@ class QueueServer implements CrawlConstants
         $crawl_status['VISITED_URLS_COUNT'] =$info_bundle['VISITED_URLS_COUNT'];
         $crawl_status['DESCRIPTION'] = $index_archive_info['DESCRIPTION'];
         $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
-        file_put_contents(
-            CRAWL_DIR."/schedules/crawl_status.txt", serialize($crawl_status));
-        chmod(CRAWL_DIR."/schedules/crawl_status.txt", 0777);
+        file_put_contents($stat_file, serialize($crawl_status));
+        chmod($stat_file, 0777);
         crawlLog(
             "End checking for new URLs data memory usage".memory_get_usage());

diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index 9f2af505e..5e4b4b2aa 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -148,7 +148,7 @@ class FetchController extends Controller implements CrawlConstants
                     file_get_contents(CRAWL_DIR."/schedules/crawl_status.txt"));
                 if(isset($_REQUEST['fetcher_peak_memory'])) {
                     if(!isset($crawl_status['FETCHER_MEMORY']) ||
-                        $_REQUEST['fetcher_peak_memory'] <
+                        $_REQUEST['fetcher_peak_memory'] >
                         $crawl_status['FETCHER_PEAK_MEMORY']
                     ) {
                         $crawl_status['FETCHER_PEAK_MEMORY'] =
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 700beaca3..74adf2085 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -207,11 +207,18 @@ class GroupIterator extends IndexBundleIterator
                     if(is_array($doc_array) && count($doc_array) == 1) {
                         $keys = array_keys($doc_array);
                         $key = $keys[0];
-                        if($doc_array[$key][self::SCORE] > 0) {
+                        if($doc_array[$key][self::DOC_RANK] > -1) {
                             $pre_out_pages[$hash_url][$key] = $doc_array[$key];
                             $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true;
-                        } else {
+                        } else {
+                            /*
+                                Deduplication: the idea is that if the doc
+                                rank is < 0, a deduplicate info: page was
+                                written, so we should ignore that group.
+                            */
                             unset($pre_out_pages[$hash_url]);
+                            $this->grouped_keys[$hash_url] = true;
+                            //mark we have seen this group
                         }
                     }
                 } else {
diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt
index b6bef56f0..5a165df53 100755
--- a/locale/en-US/statistics.txt
+++ b/locale/en-US/statistics.txt
@@ -1 +1 @@
-d:99;
\ No newline at end of file
+d:100;
\ No newline at end of file

ViewGit