Code to try to prevent single news sources from being over-represented in news, a=chris
Code to try to prevent single news sources from being over-represented in news, a=chris
diff --git a/src/data/public_default.db b/src/data/public_default.db
index fca4e6de9..83fe0e370 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index 34367ce0a..52b904455 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -83,6 +83,12 @@ class FeedsUpdateJob extends MediaJob
* Mamimum number of thumb_urls to download in one try
*/
const MAX_THUMBS_ONE_GO = 100;
+ /**
+ * For a given feed update, the factor by which the number of items from a
+ * single source may exceed the average number of items per source before
+ * further items from that source are dropped.
+ */
+ const SINGLE_SOURCE_FACTOR = 1.2;
/**
* how long in seconds before a feed item expires
*/
@@ -686,6 +692,26 @@ class FeedsUpdateJob extends MediaJob
$seen_url_count = 0;
$index_archive = $this->getFeedBundle();
$limit = C\NUM_DOCS_PER_PARTITION;
+ $source_names_count = 0;
+ $source_counts = [];
+ $items_count = 0;
+ foreach ($items as $item) {
+ if (!empty($item['SOURCE_NAME'])) {
+ $items_count++;
+ }
+ if (!empty($item['SOURCE_NAME']) &&
+ !isset($source_counts[$item['SOURCE_NAME']])) {
+ $source_counts[$item['SOURCE_NAME']] = 0;
+ $source_names_count++;
+ }
+ }
+ $average_items_per_source =
+ ceil($items_count / max($source_names_count, 1));
+ $single_source_threshold = self::SINGLE_SOURCE_FACTOR *
+ $average_items_per_source;
+ L\crawlLog("Single source threshold for this update is ".
+ "$single_source_threshold ");
+ $sources_with_dropped_items = [];
foreach ($items as $item) {
L\crawlTimeoutLog(
"----..have added %s items to new item store.", $i);
@@ -693,6 +719,13 @@ class FeedsUpdateJob extends MediaJob
if (!isset($item['SOURCE_NAME'])) {
continue;
}
+ $source_name = $item['SOURCE_NAME'];
+ if (!empty($source_counts[$source_name]) &&
+ $source_counts[$source_name] > $single_source_threshold) {
+ $sources_with_dropped_items[$source_name] = 1;
+ continue;
+ }
+ $source_counts[$source_name]++;
$raw_guid = L\unbase64Hash($item["GUID"]);
$doc_id = L\crawlHash($item['LINK'], true) . $raw_guid .
"f" . substr(L\crawlHash(UrlParser::getHost($item['LINK']) .
@@ -719,6 +752,13 @@ class FeedsUpdateJob extends MediaJob
}
}
if (!empty($seen_sites)) {
+ if (!empty($sources_with_dropped_items)) {
+ $dropped_sources = array_keys($sources_with_dropped_items);
+ L\crawlLog("The following sources had dropped items:");
+ foreach($dropped_sources as $dropped_source) {
+ L\crawlLog(" $dropped_source");
+ }
+ }
L\crawlLog("Adding " . count($seen_sites) ." pages and keys");
$index_archive->addPagesAndSeenKeys($seen_sites, $seen_url_count);
}