Fix recommendation job issues for pack-unpack functions, r=chris

Parth Patel [2022-12-12, 03h]
Fix recommendation job issues for pack-unpack functions, r=chris

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
src/library/LRUCache.php
src/library/media_jobs/RecommendationJob.php
diff --git a/src/library/LRUCache.php b/src/library/LRUCache.php
index db3ed8231..bc0077556 100644
--- a/src/library/LRUCache.php
+++ b/src/library/LRUCache.php
@@ -98,4 +98,13 @@ class LRUCache
             return $this->cache[$key];
         }
     }
-}
+    /**
+     * Returns all the items currently in cache
+     *
+     * @return array the cache's internal key => value array
+     */
+    public function getAll()
+    {
+        return $this->cache;
+    }
+}
\ No newline at end of file
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index f772f88ac..d96a15f6e 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -330,7 +330,8 @@ class RecommendationJob extends MediaJob
         $term_embeddings = [];
         $item_terms = [];
         while ($row = $db->fetchArray($results)) {
-            $this->lru_cache->put($row['ID'], unserialize($row['VECTOR']));
+            $this->lru_cache->put($row['ID'],
+                base64_decode($row['VECTOR'], true));
         }
         $context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
             (self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
@@ -353,35 +354,35 @@ class RecommendationJob extends MediaJob
             $item_terms[$item_id] = [$terms, $row['GROUP_ID']];
             for ($i = 0; $i < count($terms); $i++) {
                 [$term_id, $term] = $terms[$i];
-                $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE;
+                $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
                 $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
-                $term_sign = unpack('n', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
+                $term_sign = unpack('N', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
                 $term_embedding = $this->getTermEmbedding($term_id,
                     C\THREAD_RECOMMENDATION);
-                $term_embedding = unpack("d$this->size", $term_embedding);
+                $term_embedding = unpack("E*", $term_embedding);
                 for ($j = $i - 1; $j >= 0 &&
                     $j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) {
                     [$context_term_id, $context_term] = $terms[$j];
                     $context_term_embedding = $this->getTermEmbedding(
                         $context_term_id, C\THREAD_RECOMMENDATION);
-                    $context_term_embedding = unpack("d$this->size",
+                    $context_term_embedding = unpack("E*",
                         $context_term_embedding);
                     $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
                     $context_term_hash = $context_term_id %
-                        self::EMBEDDING_VECTOR_SIZE;
+                        self::EMBEDDING_VECTOR_SIZE + 1;
                     $context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $context_term, true);
-                    $context_term_sign = unpack('n', $context_term_sign_hash)[1]
+                    $context_term_sign = unpack('N', $context_term_sign_hash)[1]
                         % 2 == 0 ? -1 : 1;
                     $term_embedding[$context_term_hash] +=
                         $context_term_sign * $weight;
                     $context_term_embedding[$term_hash] += $term_sign * $weight;
-                    $context_term_embedding = pack("d$this->size",
+                    $context_term_embedding = pack("E*",
                         ...$context_term_embedding);
                     $this->updateTermEmbeddingCache($context_term_id,
                         $context_term_embedding, C\THREAD_RECOMMENDATION);
                 }
-                $term_embedding = pack("d$this->size", ...$term_embedding);
+                $term_embedding = pack("E*", ...$term_embedding);
                 $this->updateTermEmbeddingCache($term_id, $term_embedding,
                     C\THREAD_RECOMMENDATION);
             }
@@ -402,16 +403,16 @@ class RecommendationJob extends MediaJob
         $db = $this->db;
         $updated_item_embeddings = [];
         foreach ($item_terms as $item_id => [$terms, $group_id]) {
-            $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
+            $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
             foreach ($terms as [$term_id, $term]) {
                 $term_embedding = $this->getTermEmbedding($term_id,
                     C\THREAD_RECOMMENDATION, true);
-                $term_embedding = unpack("d$this->size", $term_embedding);
+                $term_embedding = unpack("E*", $term_embedding);
                 $item_embedding = LinearAlgebra::add($item_embedding,
                     $term_embedding);
             }
             $item_embedding = LinearAlgebra::normalize($item_embedding);
-            $item_embedding = pack("d$this->size", ...$item_embedding);
+            $item_embedding = pack("E*", ...$item_embedding);
             $updated_item_embeddings[$item_id] = [$item_embedding, $group_id];
         }
         $base_delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING" .
@@ -424,7 +425,7 @@ class RecommendationJob extends MediaJob
         $item_type = C\THREAD_RECOMMENDATION;
         foreach ($updated_item_embeddings as
             $item_id => [$embedding, $parent_id]) {
-            $embedding = serialize(unpack("d$this->size", $embedding));
+            $embedding = base64_encode($embedding);
             $insert_sql .= "$comma($item_id, $item_type, " .
                 "'$embedding', $parent_id)";
             $delete_sql .= "$comma $item_id";
@@ -478,12 +479,12 @@ class RecommendationJob extends MediaJob
             $user_id = $row['USER_ID'];
             $item_ids = explode(",", $row['ITEM_IDS']);
             $item_ids = array_unique($item_ids);
-            $item_user_embeddings[$user_id] = array_fill(0,
+            $item_user_embeddings[$user_id] = array_fill(1,
                 self::EMBEDDING_VECTOR_SIZE, 0);
             $user_items[$user_id] = [];
             foreach ($item_ids as $item_id) {
                 if (array_key_exists($item_id, $item_embeddings)) {
-                    $item_embedding = unpack("d$this->size",
+                    $item_embedding = unpack("E*",
                         $item_embeddings[$item_id][0]);
                     $item_user_embeddings[$user_id] = LinearAlgebra::add(
                         $item_user_embeddings[$user_id], $item_embedding);
@@ -492,7 +493,7 @@ class RecommendationJob extends MediaJob
             }
             $item_user_embeddings[$user_id] = LinearAlgebra::normalize(
                 $item_user_embeddings[$user_id]);
-            $item_user_embeddings[$user_id] = pack("d$this->size",
+            $item_user_embeddings[$user_id] = pack("E*",
                 ...$item_user_embeddings[$user_id]);
         }
         return [$item_user_embeddings, $user_items];
@@ -526,7 +527,7 @@ class RecommendationJob extends MediaJob
         }
         $item_user_recommendations = [];
         foreach ($item_user_embeddings as $user_id => $embedding) {
-            $embedding = unpack("d$this->size", $embedding);
+            $embedding = unpack("E*", $embedding);
             if (array_key_exists($user_id, $user_groups)) {
                 foreach ($item_embeddings as
                     $item_id => [$item_embedding, $parent_id]) {
@@ -534,7 +535,7 @@ class RecommendationJob extends MediaJob
                         !in_array($parent_id, $user_groups[$user_id])) {
                         continue;
                     }
-                    $item_embedding = unpack("d$this->size", $item_embedding);
+                    $item_embedding = unpack("E*", $item_embedding);
                     $similarity = LinearAlgebra::similarity(
                         $item_embedding, $embedding);
                     $item_user_recommendations[] = [$user_id,
@@ -584,19 +585,19 @@ class RecommendationJob extends MediaJob
         $updated_group_embeddings = [];
         foreach ($item_embeddings as $item_id => [$embedding, $parent_id]) {
             if (array_key_exists($parent_id, $updated_group_embeddings)) {
-                $embedding = unpack("d$this->size", $embedding);
-                $group_embedding = unpack("d$this->size",
+                $embedding = unpack("E*", $embedding);
+                $group_embedding = unpack("E*",
                     $updated_group_embeddings[$parent_id]);
-                $updated_group_embeddings[$parent_id] = pack("d$this->size",
+                $updated_group_embeddings[$parent_id] = pack("E*",
                     ...LinearAlgebra::add($embedding, $group_embedding));
             } else {
                 $updated_group_embeddings[$parent_id] = $embedding;
             }
         }
         foreach ($updated_group_embeddings as $group_id => $embedding) {
-            $embedding = unpack("d$this->size", $embedding);
+            $embedding = unpack("E*", $embedding);
             $embedding = LinearAlgebra::normalize($embedding);
-            $updated_group_embeddings[$group_id] = pack("d$this->size",
+            $updated_group_embeddings[$group_id] = pack("E*",
                 ...$embedding);
         }
         $base_delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING" .
@@ -608,7 +609,7 @@ class RecommendationJob extends MediaJob
         $insert_count = 0;
         $item_type = C\GROUP_RECOMMENDATION;
         foreach ($updated_group_embeddings as $group_id => $embedding) {
-            $embedding = serialize(unpack("d$this->size", $embedding));
+            $embedding = serialize(unpack("E*", $embedding));
             $insert_sql .= "$comma($group_id, $item_type, " .
                 "'$embedding', $group_id)";
             $delete_sql .= "$comma $group_id";
@@ -663,19 +664,19 @@ class RecommendationJob extends MediaJob
             $user_id = $row['USER_ID'];
             $group_ids = explode(",", $row['ITEM_IDS']);
             $group_ids = array_unique($group_ids);
-            $group_user_embeddings[$user_id] = array_fill(0,
+            $group_user_embeddings[$user_id] = array_fill(1,
                 self::EMBEDDING_VECTOR_SIZE, 0);
             $user_groups[$user_id] = [];
             foreach ($group_ids as $group_id) {
                 if (array_key_exists($group_id, $group_embeddings)) {
-                    $embedding = unpack("d$this->size",
+                    $embedding = unpack("E*",
                         $group_embeddings[$group_id]);
                     $group_user_embeddings[$user_id] = LinearAlgebra::add(
                         $group_user_embeddings[$user_id], $embedding);
                     $user_groups[$user_id][] = $group_id;
                 }
             }
-            $group_user_embeddings[$user_id] = pack("d$this->size",
+            $group_user_embeddings[$user_id] = pack("E*",
                 ...LinearAlgebra::normalize($group_user_embeddings[$user_id]));
         }
         return [$group_user_embeddings, $user_groups];
@@ -704,14 +705,14 @@ class RecommendationJob extends MediaJob
         }
         $group_user_recommendations = [];
         foreach ($group_user_embeddings as $user_id => $embedding) {
+            $embedding = unpack("E*", $embedding);
             foreach ($group_embeddings as $group_id => $group_embedding) {
                 if (in_array($group_id, $exclude_group_ids) ||
                     in_array($group_id, $user_groups[$user_id]) ||
                     in_array($group_id, $user_group_impression[$user_id])) {
                     continue;
                 }
-                $embedding = unpack("d$this->size", $embedding);
-                $group_embedding = unpack("d$this->size", $group_embedding);
+                $group_embedding = unpack("E*", $group_embedding);
                 $similarity = LinearAlgebra::similarity($embedding,
                     $group_embedding);
                 $group_user_recommendations[] = [$user_id, $group_id,
@@ -801,6 +802,9 @@ class RecommendationJob extends MediaJob
         $descriptions = [];
         $resource_metadata = [];
         foreach ($thumb_folders as $thumb_folder) {
+            if (empty($thumb_folder)) {
+                continue;
+            }
             list($group_id, $page_id, $folder) = explode("###", $thumb_folder);
             $folder = trim($folder, " \n\r\t\v\x00");
             $files = $this->getDescriptionFiles($folder);
@@ -872,7 +876,8 @@ class RecommendationJob extends MediaJob
         $resource_terms = [];
         $meta_details_terms = [];
         while ($row = $db->fetchArray($results)) {
-            $this->lru_cache->put($row['ID'], unserialize($row['VECTOR']));
+            $this->lru_cache->put($row['ID'],
+                base64_decode($row['VECTOR'], true));
         }
         $context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
             (self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
@@ -902,36 +907,38 @@ class RecommendationJob extends MediaJob
                 $terms = $resource_terms[$resource_id];
                 for ($i = 0; $i < count($terms); $i++) {
                     [$term_id, $term] = $terms[$i];
-                    $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE;
+                    $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
                     $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
                         $term, true);
-                    $term_sign = unpack('n', $term_sign_hash)[1]
+                    $term_sign = unpack('N', $term_sign_hash)[1]
                         % 2 == 0 ? -1 : 1;
                     $term_embedding = $this->getTermEmbedding($term_id,
                         C\RESOURCE_RECOMMENDATION);
-                    $term_embedding = unpack("d$this->size", $term_embedding);
+                    $term_embedding = unpack("E*", $term_embedding);
                     for ($j = $i - 1; $j >= 0 &&
                         $j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) {
                         [$context_term_id, $context_term] = $terms[$j];
                         $context_term_embedding = $this->getTermEmbedding(
                             $context_term_id, C\RESOURCE_RECOMMENDATION);
-                        $context_term_embedding = unpack("d$this->size",
+                        $context_term_embedding = unpack("E*",
                             $context_term_embedding);
                         $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
                         $context_term_hash = $context_term_id %
-                            self::EMBEDDING_VECTOR_SIZE;
+                            self::EMBEDDING_VECTOR_SIZE + 1;
                         $context_term_sign_hash = hash(
                             self::SIGN_HASH_ALGORITHM, $context_term, true);
-                        $context_term_sign = unpack('n',
+                        $context_term_sign = unpack('N',
                             $context_term_sign_hash)[1] % 2 == 0 ? -1 : 1;
                         $term_embedding[$context_term_hash] +=
                             $context_term_sign * $weight;
-                            $context_term_embedding = pack("d$this->size",
+                        $context_term_embedding[$term_hash] +=
+                            $term_sign * $weight;
+                        $context_term_embedding = pack("E*",
                             ...$context_term_embedding);
                         $this->updateTermEmbeddingCache($context_term_id,
                             $context_term_embedding, C\RESOURCE_RECOMMENDATION);
                     }
-                    $term_embedding = pack("d$this->size", ...$term_embedding);
+                    $term_embedding = pack("E*", ...$term_embedding);
                     $this->updateTermEmbeddingCache($term_id, $term_embedding,
                         C\RESOURCE_RECOMMENDATION);
                 }
@@ -974,7 +981,7 @@ class RecommendationJob extends MediaJob
                 in_array($term, self::DESCRIPTION_STOP_WORDS)) {
                 continue;
             }
-            $term_id = unpack('n', md5($term, true))[1];
+            $term_id = unpack('N', md5($term, true))[1];
             $term_ids[] = [$term_id, $term];
         }
         return $term_ids;
@@ -994,22 +1001,22 @@ class RecommendationJob extends MediaJob
         $db = $this->db;
         $updated_item_embeddings = [];
         foreach ($resource_terms as $resource_id => $terms) {
-            $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
+            $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
             foreach ($terms as [$term_id, $term]) {
                 $term_embedding = $this->getTermEmbedding($term_id,
                     C\RESOURCE_RECOMMENDATION, true);
-                $term_embedding = unpack("d$this->size", $term_embedding);
+                $term_embedding = unpack("E*", $term_embedding);
                 $item_embedding = LinearAlgebra::add($item_embedding,
                     $term_embedding);
             }
-            $updated_item_embeddings[$resource_id] = pack("d$this->size",
+            $updated_item_embeddings[$resource_id] = pack("E*",
                 ...$item_embedding);
         }
         foreach ($meta_details_terms as $resource_id => $meta_terms) {
             if (!array_key_exists($resource_id, $updated_item_embeddings)) {
-                $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
+                $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
             } else {
-                $item_embedding = unpack("d$this->size",
+                $item_embedding = unpack("E*",
                     $updated_item_embeddings[$resource_id]);
             }
             foreach ($meta_terms as [$meta_term_id, $meta_term]) {
@@ -1017,16 +1024,16 @@ class RecommendationJob extends MediaJob
                     continue;
                 }
                 $sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
-                $sign = unpack('n', $sign_hash)[1] % 2 == 0 ? -1 : 1;
-                $item_embedding[$meta_term_id%self::EMBEDDING_VECTOR_SIZE] +=
-                    $sign * 1.0;
+                $sign = unpack('N', $sign_hash)[1] % 2 == 0 ? -1 : 1;
+                $item_embedding[$meta_term_id%self::EMBEDDING_VECTOR_SIZE + 1]
+                    += $sign * 1.0;
             }
-            $updated_item_embeddings[$resource_id] = pack("d$this->size",
+            $updated_item_embeddings[$resource_id] = pack("E*",
                 ...$item_embedding);
         }
         foreach ($updated_item_embeddings as $item_id => $embedding) {
-            $embedding = unpack("d$this->size", $embedding);
-            $updated_item_embeddings[$item_id] = pack("d$this->size",
+            $embedding = unpack("E*", $embedding);
+            $updated_item_embeddings[$item_id] = pack("E*",
                 ...LinearAlgebra::normalize($embedding));
         }
         $delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING WHERE" .
@@ -1038,7 +1045,7 @@ class RecommendationJob extends MediaJob
         $insert_count = 0;
         $item_type = C\RESOURCE_RECOMMENDATION;
         foreach ($updated_item_embeddings as $resource_id => $embedding) {
-            $embedding = serialize(unpack("d$this->size", $embedding));
+            $embedding = base64_encode($embedding);
             $insert_sql .= "$comma($resource_id, $item_type," .
                 " '$embedding', $resource_id)";
             $comma = ",";
@@ -1087,21 +1094,20 @@ class RecommendationJob extends MediaJob
             $user_id = $row['USER_ID'];
             $item_ids = explode(",", $row['ITEM_IDS']);
             $item_ids = array_unique($item_ids);
-            $user_embeddings[$user_id] = array_fill(0,
+            $user_embeddings[$user_id] = array_fill(1,
                 self::EMBEDDING_VECTOR_SIZE, 0);
             $user_items[$user_id] = [];
             foreach ($item_ids as $item_id) {
                 if (array_key_exists($item_id, $item_embeddings)) {
-                    $embedding = unpack("d$this->size",
+                    $embedding = unpack("E*",
                         $item_embeddings[$item_id]);
                     $user_embeddings[$user_id] = LinearAlgebra::add(
                         $user_embeddings[$user_id], $embedding);
                     $user_items[$user_id][] = $item_id;
                 }
             }
-            $user_embeddings[$user_id] = pack("d$this->size",
-                ...LinearAlgebra::normalize(
-                $user_embeddings[$user_id]));
+            $user_embeddings[$user_id] = pack("E*",
+                ...LinearAlgebra::normalize($user_embeddings[$user_id]));
         }
         return [$user_embeddings, $user_items];
     }
@@ -1120,13 +1126,13 @@ class RecommendationJob extends MediaJob
         $db = $this->db;
         $recommendations = [];
         foreach ($user_embeddings as $user_id => $user_embedding) {
-            $user_embedding = unpack("d$this->size", $user_embedding);
+            $user_embedding = unpack("E*", $user_embedding);
             foreach ($item_embeddings as $item_id => $item_embedding) {
                 if (in_array($item_id, $user_items[$user_id]) ||
                     !array_key_exists($item_id, $resource_metadata)) {
                     continue;
                 }
-                $item_embedding = unpack("d$this->size", $item_embedding);
+                $item_embedding = unpack("E*", $item_embedding);
                 $similarity = LinearAlgebra::similarity($user_embedding,
                     $item_embedding);
                 list($group_id, $page_id, $resource_path) =
@@ -1182,10 +1188,10 @@ class RecommendationJob extends MediaJob
             $result = $db->execute($sql, [$item_type, $term_id]);
             $row = $db->fetchArray($result);
             if (!$row) {
-                $term_embedding = pack("d$this->size", ...array_fill(0,
+                $term_embedding = pack("E*", ...array_fill(1,
                     self::EMBEDDING_VECTOR_SIZE, 0.0));
             } else {
-                $term_embedding = unserialize($row['VECTOR']);
+                $term_embedding = base64_decode($row['VECTOR'], true);
             }
         }
         if ($update) {
@@ -1213,7 +1219,7 @@ class RecommendationJob extends MediaJob
             $db->execute($sql, [$item_type, $evicted_item[0]]);
             $sql = "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES (?, ?, ?)";
             $db->execute($sql, [$evicted_item[0], $item_type,
-                serialize($evicted_item[1])]);
+                base64_encode($evicted_item[1])]);
         }
     }
     /**
@@ -1223,6 +1229,7 @@ class RecommendationJob extends MediaJob
      */
     public function saveTermEmbeddingsCacheToDb($item_type)
     {
+        $db = $this->db;
         $base_delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING" .
             " WHERE ITEM_TYPE = ? AND ID IN (";
         $delete_sql = $base_delete_sql;
@@ -1230,9 +1237,9 @@ class RecommendationJob extends MediaJob
         $insert_sql = $base_insert_sql;
         $comma = "";
         $insert_count = 0;
-        foreach ($this->lru_cache as $id => $embedding) {
-            $embedding = serialize($embedding);
-            $insert_sql .= "$comma($id, $item_type, $embedding)";
+        foreach ($this->lru_cache->getAll() as $id => $embedding) {
+            $embedding = base64_encode($embedding);
+            $insert_sql .= "$comma($id, $item_type, '$embedding')";
             $delete_sql .= "$comma $id";
             $comma = ",";
             $insert_count++;
ViewGit