Switch blob to varchar in recommendation embeddings, a=chris

Chris Pollett [2022-12-14 05:Dec:th]
Switch blob to varchar in recommendation embeddings, a=chris
Filename
src/configs/Config.php
src/library/VersionFunctions.php
src/library/media_jobs/RecommendationJob.php
src/models/ProfileModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 306330d75..19280d973 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
  * Version number for upgrade database function
  * @var int
  */
-nsdefine('DATABASE_VERSION', 75);
+nsdefine('DATABASE_VERSION', 76);
 /**
  * Minimum Version fo Yioop for which keyword ad script
  * still works with this version
@@ -1264,6 +1264,8 @@ nsdefine('LONG_NAME_LEN', 64);
 nsdefine('SHORT_TITLE_LEN', 128);
 /** Length for names of things like titles of blog entries, etc */
 nsdefine('TITLE_LEN', 512);
+/** Number of components in a term or item embedding */
+nsdefine('EMBEDDING_VECTOR_SIZE', 200);
 /** Length of a feed item or post, etc */
 nsdefine('MAX_GROUP_POST_LEN', 8192);
 /** Length for for the contents of a wiki_page */
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 763a33d73..dac1149f4 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2072,3 +2072,23 @@ function upgradeDatabaseVersion75(&$db)
     $db->execute("CREATE INDEX GR_RECOMMENDATION_INDEX ON " .
         "GROUP_RESOURCE_RECOMMENDATION (USER_ID)");
 }
+/**
+ * Upgrades a Version 75 version of the Yioop database to a Version 76 version
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion76(&$db)
+{
+    $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+        "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+    $integer = $db->integerType($dbinfo);
+    $db->execute("DROP TABLE RECOMMENDATION_TERM_EMBEDDING");
+    $db->execute("DROP TABLE RECOMMENDATION_ITEM_EMBEDDING");
+    $db->execute("CREATE TABLE RECOMMENDATION_TERM_EMBEDDING " .
+        "(ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+            32 * C\EMBEDDING_VECTOR_SIZE . "), " .
+        "PRIMARY KEY(ID, ITEM_TYPE))");
+    $db->execute("CREATE TABLE RECOMMENDATION_ITEM_EMBEDDING " .
+        "(ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+            32 * C\EMBEDDING_VECTOR_SIZE . "), " .
+        "PARENT_ID $integer, PRIMARY KEY(ID, ITEM_TYPE))");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 1a67be9ab..74b74139b 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -91,10 +91,6 @@ class RecommendationJob extends MediaJob
      * Length of context window for calculating term embeddings
      */
     const CONTEXT_WINDOW_LENGTH = 5;
-    /**
-     * Size of term and item embedding vector
-     */
-    const EMBEDDING_VECTOR_SIZE = 200;
     /**
      * Update period to consider for fetching the records from
      * ITEM_IMPRESSION_SUMMARY table
@@ -133,7 +129,7 @@ class RecommendationJob extends MediaJob
         $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
         $this->db = new $db_class();
         $this->db->connect();
-        $this->size = self::EMBEDDING_VECTOR_SIZE;
+        $this->size = C\EMBEDDING_VECTOR_SIZE;
     }
     /**
      * Only update if its been more than an hour since the last update
@@ -256,7 +252,7 @@ class RecommendationJob extends MediaJob
                 $insert_recommend_sql .=
                     "$comma ($user_id, {$recommendation['GROUP_ID']}, ".
                     "{$recommendation['PAGE_ID']}, " .
-                    "\'{$recommendation['RESOURCE_PATH']}\', ".
+                    "'{$recommendation['RESOURCE_PATH']}', ".
                     "{$recommendation['TOTAL_SCORE']}, {$this->update_time}, ".
                     "{$recommendation['RESOURCE_ID']})";
                 $comma = ",";
@@ -333,6 +329,7 @@ class RecommendationJob extends MediaJob
         $results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]);
         $term_embeddings = [];
         $item_terms = [];
+        L\crawlLog("Start Populating LRUCache of Embeddings...");
         while ($row = $db->fetchArray($results)) {
             if (is_string($row['VECTOR'])) {
                 $this->lru_cache->put($row['ID'],
@@ -341,6 +338,7 @@ class RecommendationJob extends MediaJob
                 var_dump($row['VECTOR']);
             }
         }
+        L\crawlLog("Finish Populating LRUCache of Embeddings");
         $context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
             (self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
         $mean = $context_distance_sum / self::CONTEXT_WINDOW_LENGTH;
@@ -366,7 +364,7 @@ class RecommendationJob extends MediaJob
             for ($i = 0; $i < $num_terms; $i++) {
                 L\crawlTimeoutLog("Have processed $i of $num_terms terms");
                 [$term_id, $term] = $terms[$i];
-                $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
+                $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
                 $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
                 $term_sign = unpack('N', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
                 $term_embedding = $this->getTermEmbedding($term_id,
@@ -381,7 +379,7 @@ class RecommendationJob extends MediaJob
                         $context_term_embedding);
                     $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
                     $context_term_hash = $context_term_id %
-                        self::EMBEDDING_VECTOR_SIZE + 1;
+                        C\EMBEDDING_VECTOR_SIZE + 1;
                     $context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $context_term, true);
                     $context_term_sign = unpack('N', $context_term_sign_hash)[1]
@@ -418,7 +416,7 @@ class RecommendationJob extends MediaJob
         $item_count = 0;
         foreach ($item_terms as $item_id => [$terms, $group_id]) {
             L\crawlTimeoutLog("Have done $item_count many group items");
-            $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+            $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
             foreach ($terms as [$term_id, $term]) {
                 $term_embedding = $this->getTermEmbedding($term_id,
                     C\THREAD_RECOMMENDATION, true);
@@ -503,7 +501,7 @@ class RecommendationJob extends MediaJob
             $item_ids = explode(",", $row['ITEM_IDS']);
             $item_ids = array_unique($item_ids);
             $item_user_embeddings[$user_id] = array_fill(1,
-                self::EMBEDDING_VECTOR_SIZE, 0);
+                C\EMBEDDING_VECTOR_SIZE, 0);
             $user_items[$user_id] = [];
             foreach ($item_ids as $item_id) {
                 if (array_key_exists($item_id, $item_embeddings)) {
@@ -710,7 +708,7 @@ class RecommendationJob extends MediaJob
             $group_ids = explode(",", $row['ITEM_IDS']);
             $group_ids = array_unique($group_ids);
             $group_user_embeddings[$user_id] = array_fill(1,
-                self::EMBEDDING_VECTOR_SIZE, 0);
+                C\EMBEDDING_VECTOR_SIZE, 0);
             $user_groups[$user_id] = [];
             $group_count = 0;
             foreach ($group_ids as $group_id) {
@@ -982,7 +980,7 @@ class RecommendationJob extends MediaJob
                 for ($i = 0; $i < $num_terms; $i++) {
                     L\crawlTimeoutLog("Have processed $i of $num_terms terms");
                     [$term_id, $term] = $terms[$i];
-                    $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
+                    $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
                     $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $term, true);
                     $term_sign = unpack('N', $term_sign_hash)[1]
@@ -999,7 +997,7 @@ class RecommendationJob extends MediaJob
                             $context_term_embedding);
                         $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
                         $context_term_hash = $context_term_id %
-                            self::EMBEDDING_VECTOR_SIZE + 1;
+                            C\EMBEDDING_VECTOR_SIZE + 1;
                         $context_term_sign_hash = hash(
                             self::SIGN_HASH_ALGORITHM, $context_term, true);
                         $context_term_sign = unpack('N',
@@ -1079,7 +1077,7 @@ class RecommendationJob extends MediaJob
         $resource_count = 0;
         foreach ($resource_terms as $resource_id => $terms) {
             L\crawlTimeoutLog("Have processed $resource_count many resources");
-            $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+            $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
             foreach ($terms as [$term_id, $term]) {
                 $term_embedding = $this->getTermEmbedding($term_id,
                     C\RESOURCE_RECOMMENDATION, true);
@@ -1093,7 +1091,7 @@ class RecommendationJob extends MediaJob
         }
         foreach ($meta_details_terms as $resource_id => $meta_terms) {
             if (!array_key_exists($resource_id, $updated_item_embeddings)) {
-                $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+                $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
             } else {
                 $item_embedding = unpack("E*",
                     $updated_item_embeddings[$resource_id]);
@@ -1104,7 +1102,7 @@ class RecommendationJob extends MediaJob
                 }
                 $sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
                 $sign = unpack('N', $sign_hash)[1] % 2 == 0 ? -1 : 1;
-                $item_embedding[$meta_term_id%self::EMBEDDING_VECTOR_SIZE + 1]
+                $item_embedding[$meta_term_id % C\EMBEDDING_VECTOR_SIZE + 1]
                     += $sign * 1.0;
             }
             $updated_item_embeddings[$resource_id] = pack("E*",
@@ -1181,7 +1179,7 @@ class RecommendationJob extends MediaJob
             $item_ids = explode(",", $row['ITEM_IDS']);
             $item_ids = array_unique($item_ids);
             $user_embeddings[$user_id] = array_fill(1,
-                self::EMBEDDING_VECTOR_SIZE, 0);
+                C\EMBEDDING_VECTOR_SIZE, 0);
             $user_items[$user_id] = [];
             foreach ($item_ids as $item_id) {
                 if (array_key_exists($item_id, $item_embeddings)) {
@@ -1249,7 +1247,7 @@ class RecommendationJob extends MediaJob
                 $score, $item_id) = $recommendation;
             $time = $this->update_time;
             $insert_sql .= "$comma($user_id, $group_id, $page_id, " .
-                "\'$resource_path\', $score, $time, $item_id)";
+                "'$resource_path', $score, $time, $item_id)";
             $comma = ",";
             $insert_count++;
             $total_insert++;
@@ -1285,7 +1283,7 @@ class RecommendationJob extends MediaJob
             $row = $db->fetchArray($result);
             if (!$row || !is_string($row['VECTOR'])) {
                 $term_embedding = pack("E*", ...array_fill(1,
-                    self::EMBEDDING_VECTOR_SIZE, 0.0));
+                    C\EMBEDDING_VECTOR_SIZE, 0.0));
             } else {
                 $term_embedding = base64_decode($row['VECTOR'], true);
             }
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 55a320906..07f0466e2 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -121,7 +121,6 @@ class ProfileModel extends Model
         $auto_increment = $dbm->autoIncrement($dbinfo);
         $serial = $dbm->serialType($dbinfo);
         $integer = $dbm->integerType($dbinfo);
-        $blob = $dbm->blobType($dbinfo);
         $page_type = $dbm->pageType($dbinfo);
         $scraper_text = (stristr($dbinfo['DBMS'], "mysql") !== false) ?
             " TEXT " :  " VARCHAR(" . (10 * C\MAX_URL_LEN) . ") ";
@@ -332,11 +331,13 @@ class ProfileModel extends Model
                 PRIMARY KEY(ROLE_ID, ACTIVITY_ID))",
             "RECOMMENDATION_ITEM_EMBEDDING" => "CREATE TABLE
                 RECOMMENDATION_ITEM_EMBEDDING (ID $integer NOT NULL,
-                ITEM_TYPE $integer NOT NULL, VECTOR $blob, PARENT_ID $integer,
+                ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+                    32 * C\EMBEDDING_VECTOR_SIZE . "), PARENT_ID $integer,
                 PRIMARY KEY(ID, ITEM_TYPE))",
             "RECOMMENDATION_TERM_EMBEDDING" => "CREATE TABLE
                 RECOMMENDATION_TERM_EMBEDDING (ID $integer NOT NULL,
-                ITEM_TYPE $integer NOT NULL, VECTOR $blob,
+                ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+                    32 * C\EMBEDDING_VECTOR_SIZE . "),
                 PRIMARY KEY(ID, ITEM_TYPE))",
             "SCRAPER" =>
                 "CREATE TABLE SCRAPER (ID $serial PRIMARY KEY
ViewGit