Fix bug where a site's landing page gets classified as a video; give doc rank bonuses for company level domains

Chris Pollett [2022-07-26 21:Jul:th]
Fix bug where a site's landing page gets classified as a video; give doc rank bonuses for company level domains
Filename
src/controllers/SearchController.php
src/library/IndexDocumentBundle.php
src/library/index_bundle_iterators/WordIterator.php
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index c8a48a255..d9ed7d1e3 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1493,7 +1493,8 @@ EOD;
                     $out_pages[$first_image]['IMAGES'] = [];
                 }
                 $out_pages[$first_image]['IMAGES'][] = $page;
-            } else if (!empty($page[self::IS_VIDEO])) {
+            } else if (!empty($page[self::IS_VIDEO]) &&
+                !L\IndexDocumentBundle::isAHostDocId($page[self::KEY])) {
                 if ($first_video == -1) {
                     $first_video = count($out_pages);
                     $out_pages[$first_video]['VIDEOS'] = [];
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 65c885344..1cfc75ea8 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -855,6 +855,11 @@ class IndexDocumentBundle implements CrawlConstants
             }
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
+            $cld = UrlParser::getCompanyLevelDomain($site_url);
+            if (in_array($site_url, ["https://$cld/",
+                "https://www.$cld/", "http://$cld/", "http://www.$cld/"])) {
+                $letter_code = chr(ord($letter_code) + 128);
+            }
             $doc_id = crawlHash($site_url, true) . $hash .
                 $letter_code . substr(crawlHash($host . "/", true), 1);
         }
@@ -963,6 +968,7 @@ class IndexDocumentBundle implements CrawlConstants
      * Checks if a doc_id $key is that of a host url.
      * I.e., a url https://www.yahoo.com/ as opposed to
      * https://www.yahoo.com/foo
+     * @param string $key doc_id to check for being that of a host url
      */
     public static function isAHostDocId($key)
     {
@@ -972,6 +978,17 @@ class IndexDocumentBundle implements CrawlConstants
         }
         return false;
     }
+    /**
+     * Checks if a doc_id $key is that of a company level domain (cld) or
+     * www.cld, i.e., a url https://yahoo.com/ or https://www.yahoo.com/ as
+     * opposed to https://foo.yahoo.com/, by testing bit 7 of its type byte.
+     * @param string $key doc_id to check
+     * @return bool true if the bit is set; false if not or $key too short
+     */
+    public static function isACldDocId($key)
+    {
+        return (ord($key[self::DOCID_PART_LEN << 1] ?? "\0") & 128) > 0;
+    }
     /**
      * Checks if a doc_id corresponds to a particular large scale type among
      * external_link, internal_link, link (union of previous two),
@@ -1005,7 +1022,7 @@ class IndexDocumentBundle implements CrawlConstants
             $types = array_merge($types, ["binary", "feed", "image",
                 "old_doc", "text", "video"]);
         }
-        $key_type = ($key[self::DOCID_PART_LEN << 1] ?? "");
+        $key_type = chr(ord($key[self::DOCID_PART_LEN << 1] ?? 0) & 127);
         return in_array($type_map[$key_type] ?? "old_link", $types);
     }
     /**
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index e75730ed5..b8da49611 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -485,7 +485,9 @@ class WordIterator extends IndexBundleIterator
                     $remaining_partitions * $this->avg_items_per_partition +
                     $posting[self::SCORE], 10);
                 if(L\IndexDocumentBundle::isAHostDocId($doc_key)) {
-                    $posting[self::DOC_RANK] += 2;
+                    $posting[self::DOC_RANK] +=
+                        (L\IndexDocumentBundle::isACldDocId($doc_key)) ?
+                        2 : 0.5;
                 }
             }
             list($posting['TITLE_LENGTH'], $num_description_scores) =
ViewGit