Use canonicalTerm as id for term embeddings in Recommendation job, r=chris

Parth Patel [2022-12-15 06h]
Use canonicalTerm as id for term embeddings in Recommendation job, r=chris

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
src/configs/Config.php
src/library/VersionFunctions.php
src/library/media_jobs/RecommendationJob.php
src/models/ProfileModel.php
tests/ManyUserExperiment.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 662b6ae93..7aa3ba2ba 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
  * Version number for upgrade database function
  * @var int
  */
-nsdefine('DATABASE_VERSION', 76);
+nsdefine('DATABASE_VERSION', 77);
 /**
  * Minimum Version fo Yioop for which keyword ad script
  * still works with this version
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index dac1149f4..c66ef3531 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2073,7 +2073,7 @@ function upgradeDatabaseVersion75(&$db)
         "GROUP_RESOURCE_RECOMMENDATION (USER_ID)");
 }
 /**
- * Upgrades a Version 76 version of the Yioop database to a Version 75 version
+ * Upgrades a Version 75 version of the Yioop database to a Version 76 version
  * @param object $db datasource to use to upgrade
  */
 function upgradeDatabaseVersion76(&$db)
@@ -2092,3 +2092,17 @@ function upgradeDatabaseVersion76(&$db)
             32 * C\EMBEDDING_VECTOR_SIZE . "), " .
         "PARENT_ID $integer, PRIMARY KEY(ID, ITEM_TYPE))");
 }
+/**
+ * Upgrades a Version 76 version of the Yioop database to a Version 77 version
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion77(&$db)
+{
+    $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+        "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+    $integer = $db->integerType($dbinfo); // column type used for ITEM_TYPE
+    $db->execute("DROP TABLE RECOMMENDATION_TERM_EMBEDDING"); // rows discarded
+    $db->execute("CREATE TABLE RECOMMENDATION_TERM_EMBEDDING " .
+        "(ID VARCHAR(16) NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR("
+        . 32 * C\EMBEDDING_VECTOR_SIZE . "), PRIMARY KEY(ID, ITEM_TYPE))");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 920442ca2..061052364 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -98,6 +98,10 @@ class RecommendationJob extends MediaJob
     const DESCRIPTION_STOP_WORDS = ["author", "authors", "plot", "genre",
         "genres", "star", "stars", "credits", "rating", "ratings", "year",
         "director", "cast", "runtime"];
+    /**
+     * Hash algorithm to be used for calculating hash in Hash2Vec embedding
+     */
+    const HASH_ALGORITHM = "md5";
     /**
      * Hash algorithm to be used for calculating sign in Hash2Vec term embedding
      */
@@ -320,7 +324,7 @@ class RecommendationJob extends MediaJob
         $db = $this->db;
         $this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
         $select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" .
-            " ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
+            " ITEM_TYPE = ? " . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
         $results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]);
         $term_embeddings = [];
         $item_terms = [];
@@ -359,7 +363,8 @@ class RecommendationJob extends MediaJob
             for ($i = 0; $i < $num_terms; $i++) {
                 L\crawlTimeoutLog("Have processed $i of $num_terms terms");
                 [$term_id, $term] = $terms[$i];
-                $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
+                $term_hash = unpack('N', hash(self::HASH_ALGORITHM, $term, true)
+                    )[1] % C\EMBEDDING_VECTOR_SIZE + 1;
                 $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
                 $term_sign = unpack('N', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
                 $term_embedding = $this->getTermEmbedding($term_id,
@@ -373,8 +378,8 @@ class RecommendationJob extends MediaJob
                     $context_term_embedding = unpack("E*",
                         $context_term_embedding);
                     $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
-                    $context_term_hash = $context_term_id %
-                        C\EMBEDDING_VECTOR_SIZE + 1;
+                    $context_term_hash = unpack('N', hash(self::HASH_ALGORITHM,
+                        $context_term, true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
                     $context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $context_term, true);
                     $context_term_sign = unpack('N', $context_term_sign_hash)[1]
@@ -932,7 +937,7 @@ class RecommendationJob extends MediaJob
         $db = $this->db;
         $this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
         $select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE " .
-            "ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
+            "ITEM_TYPE = ? " . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
         $results = $db->execute($select_sql, [C\RESOURCE_RECOMMENDATION]);
         $resource_terms = [];
         $meta_details_terms = [];
@@ -976,7 +981,8 @@ class RecommendationJob extends MediaJob
                 for ($i = 0; $i < $num_terms; $i++) {
                     L\crawlTimeoutLog("Have processed $i of $num_terms terms");
                     [$term_id, $term] = $terms[$i];
-                    $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
+                    $term_hash = unpack('N', hash(self::HASH_ALGORITHM, $term,
+                        true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
                     $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $term, true);
                     $term_sign = unpack('N', $term_sign_hash)[1]
@@ -992,7 +998,8 @@ class RecommendationJob extends MediaJob
                         $context_term_embedding = unpack("E*",
                             $context_term_embedding);
                         $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
-                        $context_term_hash = $context_term_id %
+                        $context_term_hash = unpack('N', hash(
+                            self::HASH_ALGORITHM, $context_term, true))[1] %
                             C\EMBEDDING_VECTOR_SIZE + 1;
                         $context_term_sign_hash = hash(
                             self::SIGN_HASH_ALGORITHM, $context_term, true);
@@ -1051,7 +1058,7 @@ class RecommendationJob extends MediaJob
                 in_array($term, self::DESCRIPTION_STOP_WORDS)) {
                 continue;
             }
-            $term_id = unpack('N', md5($term, true))[1];
+            $term_id = L\canonicalTerm($term);
             $term_ids[] = [$term_id, $term];
         }
         return $term_ids;
@@ -1096,10 +1103,11 @@ class RecommendationJob extends MediaJob
                 if (strlen($meta_term) <= 1) {
                     continue;
                 }
+                $meta_term_hash = unpack('N', hash(self::HASH_ALGORITHM,
+                    $meta_term, true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
                 $sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
                 $sign = unpack('N', $sign_hash)[1] % 2 == 0 ? -1 : 1;
-                $item_embedding[$meta_term_id % C\EMBEDDING_VECTOR_SIZE + 1]
-                    += $sign * 1.0;
+                $item_embedding[$meta_term_hash] += $sign * 1.0;
             }
             $updated_item_embeddings[$resource_id] = pack("E*",
                 ...$item_embedding);
@@ -1337,8 +1345,8 @@ class RecommendationJob extends MediaJob
         foreach ($this->lru_cache->getAll() as $id => $embedding) {
             L\crawlTimeoutLog("Have inserted $total_insert many embeddings");
             $embedding = base64_encode($embedding);
-            $insert_sql .= "$comma($id, $item_type, '$embedding')";
-            $delete_sql .= "$comma $id";
+            $insert_sql .= "$comma('$id', $item_type, '$embedding')";
+            $delete_sql .= "$comma '$id'";
             $comma = ",";
             $insert_count++;
             $total_insert++;
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 07f0466e2..bceef3883 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -335,7 +335,7 @@ class ProfileModel extends Model
                     32 * C\EMBEDDING_VECTOR_SIZE . "), PARENT_ID $integer,
                 PRIMARY KEY(ID, ITEM_TYPE))",
             "RECOMMENDATION_TERM_EMBEDDING" => "CREATE TABLE
-                RECOMMENDATION_TERM_EMBEDDING (ID $integer NOT NULL,
+                RECOMMENDATION_TERM_EMBEDDING (ID VARCHAR(16) NOT NULL,
                 ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
                     32 * C\EMBEDDING_VECTOR_SIZE . "),
                 PRIMARY KEY(ID, ITEM_TYPE))",
diff --git a/tests/ManyUserExperiment.php b/tests/ManyUserExperiment.php
index 5fcb5b604..a9dd6819c 100644
--- a/tests/ManyUserExperiment.php
+++ b/tests/ManyUserExperiment.php
@@ -271,7 +271,7 @@ $web_series = [
     "Jhansi.mp4",
     "Dubai Bling.mp4",
     "The Mysterious Benedict Society Season 2.mp4",
-    "Sumo Do, Sumo Don't!.mp4",
+    "Sumo Do, Sumo Dont!.mp4",
     "Star Wars Tales of the Jedi.mp4",
     "Shadow Detective.mp4",
     "Tripling Season 3.mp4",
@@ -322,7 +322,7 @@ $books = [
     "Love on the Brain.pdf",
     "Only a Monster.pdf",
     "This Woven Kingdom.pdf",
-    "Delilah Green Doesn't Care.pdf",
+    "Delilah Green Doesnt Care.pdf",
     "Portrait of a Thief.pdf",
     "The Paris Apartment.pdf",
     "Foul Lady Fortune.pdf",
ViewGit