Fix bug where a site's landing page gets classified as a video; give doc rank bonuses for company level domains
Fix bug where a site's landing page gets classified as a video; give doc rank bonuses for company level domains
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index c8a48a255..d9ed7d1e3 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1493,7 +1493,8 @@ EOD;
$out_pages[$first_image]['IMAGES'] = [];
}
$out_pages[$first_image]['IMAGES'][] = $page;
- } else if (!empty($page[self::IS_VIDEO])) {
+ } else if (!empty($page[self::IS_VIDEO]) &&
+ !L\IndexDocumentBundle::isAHostDocId($page[self::KEY])) {
if ($first_video == -1) {
$first_video = count($out_pages);
$out_pages[$first_video]['VIDEOS'] = [];
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 65c885344..1cfc75ea8 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -855,6 +855,11 @@ class IndexDocumentBundle implements CrawlConstants
}
$site_url = str_replace('|', "%7C", $site[self::URL]);
$host = UrlParser::getHost($site_url);
+ $cld = UrlParser::getCompanyLevelDomain($site_url);
+ if (in_array($site_url, ["https://$cld/",
+ "https://www.$cld/", "http://$cld/", "http://www.$cld/"])) {
+ $letter_code = chr(ord($letter_code) + 128);
+ }
$doc_id = crawlHash($site_url, true) . $hash .
$letter_code . substr(crawlHash($host . "/", true), 1);
}
@@ -963,6 +968,7 @@ class IndexDocumentBundle implements CrawlConstants
* Checks if a doc_id $key is that of a host url.
* I.e., a url https://www.yahoo.com/ as opposed to
* https://www.yahoo.com/foo
+ * @param string $key doc_id to test for being that of a host url
*/
public static function isAHostDocId($key)
{
@@ -972,6 +978,17 @@ class IndexDocumentBundle implements CrawlConstants
}
return false;
}
+ /**
+ * Checks if a doc_id $key is that of a company level domain (cld) or
+ * www.cld, i.e., a url https://yahoo.com/ or https://www.yahoo.com/ as
+ * opposed to https://foo.yahoo.com/
+ * @param string $key doc_id to check
+ * @return bool true if the high bit of the doc_id's type byte is set
+ */
+ public static function isACldDocId($key)
+ {
+ return (ord($key[self::DOCID_PART_LEN << 1] ?? "\0") & 128) > 0;
+ }
/**
* Checks if a doc_id corresponds to a particular large scale type among
* external_link, internal_link, link (union of previous two),
@@ -1005,7 +1022,7 @@ class IndexDocumentBundle implements CrawlConstants
$types = array_merge($types, ["binary", "feed", "image",
"old_doc", "text", "video"]);
}
- $key_type = ($key[self::DOCID_PART_LEN << 1] ?? "");
+ $key_type = chr(ord($key[self::DOCID_PART_LEN << 1] ?? 0) & 127);
return in_array($type_map[$key_type] ?? "old_link", $types);
}
/**
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index e75730ed5..b8da49611 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -485,7 +485,9 @@ class WordIterator extends IndexBundleIterator
$remaining_partitions * $this->avg_items_per_partition +
$posting[self::SCORE], 10);
if(L\IndexDocumentBundle::isAHostDocId($doc_key)) {
- $posting[self::DOC_RANK] += 2;
+ $posting[self::DOC_RANK] +=
+ (L\IndexDocumentBundle::isACldDocId($doc_key)) ?
+ 2 : 0.5;
}
}
list($posting['TITLE_LENGTH'], $num_description_scores) =