Fixes a bug in how deduplication was handled, a=chris
Fixes a bug in how deduplication was handled, a=chris
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 676e310cc..ffc8a3140 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -572,22 +572,31 @@ class Fetcher implements CrawlConstants
$deduplicated_pages = array();
$not_downloaded = array();
$duplicates = array();
-
+
+ /*
+ Time to Deduplicate!
+ $unseen_page_hashes checks against all pages before this batch,
+ $seen_pages checks against pages within this batch
+ */
$unseen_page_hashes =
$this->web_archive->differencePageKeysFilter($site_pages,
self::HASH);
+ $seen_pages = array();
foreach($site_pages as $site) {
if( isset($site[self::ROBOT_PATHS])) {
$deduplicated_pages[] = $site;
} else if (isset($site[self::HASH]) && in_array($site[self::HASH],
- $unseen_page_hashes)) {
+ $unseen_page_hashes) && !in_array($site[self::HASH],
+ $seen_pages)) {
$this->web_archive->addPageFilter(self::HASH, $site);
$deduplicated_pages[] = $site;
+ $seen_pages[] = $site[self::HASH];
} else if(!isset($site[self::HASH])){
$not_downloaded[] = $site;
} else {
$duplicates[] = $site[self::URL];
+ echo "Deduplicated:".$site[self::URL]."\n";
}
}
diff --git a/bin/queue_server.php b/bin/queue_server.php
index f4ca997c8..8677edf76 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -522,6 +522,7 @@ class QueueServer implements CrawlConstants
$machine = $machine_info[self::MACHINE];
$machine_uri = $machine_info[self::MACHINE_URI];
+ //do deduplication of summaries
if(isset($sites[self::SEEN_URLS]) &&
count($sites[self::SEEN_URLS]) > 0) {
$seen_sites = $sites[self::SEEN_URLS];
@@ -851,6 +852,10 @@ class QueueServer implements CrawlConstants
unlink($file);
$crawl_status = array();
+ $stat_file = CRAWL_DIR."/schedules/crawl_status.txt";
+ if(file_exists($stat_file) ) {
+ $crawl_status = unserialize(file_get_contents($stat_file));
+ }
$crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
if(isset($sites[self::RECENT_URLS])) {
$crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
@@ -863,9 +868,8 @@ class QueueServer implements CrawlConstants
$crawl_status['VISITED_URLS_COUNT'] =$info_bundle['VISITED_URLS_COUNT'];
$crawl_status['DESCRIPTION'] = $index_archive_info['DESCRIPTION'];
$crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
- file_put_contents(
- CRAWL_DIR."/schedules/crawl_status.txt", serialize($crawl_status));
- chmod(CRAWL_DIR."/schedules/crawl_status.txt", 0777);
+ file_put_contents($stat_file, serialize($crawl_status));
+ chmod($stat_file, 0777);
crawlLog(
"End checking for new URLs data memory usage".memory_get_usage());
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index 9f2af505e..5e4b4b2aa 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -148,7 +148,7 @@ class FetchController extends Controller implements CrawlConstants
file_get_contents(CRAWL_DIR."/schedules/crawl_status.txt"));
if(isset($_REQUEST['fetcher_peak_memory'])) {
if(!isset($crawl_status['FETCHER_MEMORY']) ||
- $_REQUEST['fetcher_peak_memory'] <
+ $_REQUEST['fetcher_peak_memory'] >
$crawl_status['FETCHER_PEAK_MEMORY']
) {
$crawl_status['FETCHER_PEAK_MEMORY'] =
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 700beaca3..74adf2085 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -207,11 +207,18 @@ class GroupIterator extends IndexBundleIterator
if(is_array($doc_array) && count($doc_array) == 1) {
$keys = array_keys($doc_array);
$key = $keys[0];
- if($doc_array[$key][self::SCORE] > 0) {
+ if($doc_array[$key][self::DOC_RANK] > -1) {
$pre_out_pages[$hash_url][$key] = $doc_array[$key];
$pre_out_pages[$hash_url][$key]['IS_PAGE'] = true;
- } else {
+ } else {
+ /*
+ Deduplication: the idea is that if DOC_RANK is
+ not greater than -1, a deduplicate info: page
+ was written, so we should ignore that group.
+ */
unset($pre_out_pages[$hash_url]);
+ $this->grouped_keys[$hash_url] = true;
+ //mark that we have seen this group
}
}
} else {
diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt
index b6bef56f0..5a165df53 100755
--- a/locale/en-US/statistics.txt
+++ b/locale/en-US/statistics.txt
@@ -1 +1 @@
-d:99;
\ No newline at end of file
+d:100;
\ No newline at end of file