diff --git a/src/configs/Config.php b/src/configs/Config.php
index b3d784ca8..306330d75 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
* Version number for upgrade database function
* @var int
*/
-nsdefine('DATABASE_VERSION', 74);
+nsdefine('DATABASE_VERSION', 75);
/**
* Minimum Version fo Yioop for which keyword ad script
* still works with this version
diff --git a/src/library/LRUCache.php b/src/library/LRUCache.php
new file mode 100644
index 000000000..befac4d9c
--- /dev/null
+++ b/src/library/LRUCache.php
@@ -0,0 +1,98 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2022 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Parth Patel
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2022
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\library as L;
+
+/**
+ * Implements a least recently used cache with a fixed maximum size
+ *
+ * @author Parth Patel
+ */
+class LRUCache implements \IteratorAggregate
+{
+    /**
+     * Cached key => value pairs, most recently used entry first
+     * @var array
+     */
+    private $cache = [];
+    /**
+     * Maximum number of entries the cache may hold
+     * @var int
+     */
+    private $size;
+    /**
+     * Creates an empty cache and sets the size
+     * @param int $size maximum number of entries; values below 1 become 1
+     */
+    public function __construct($size = 100)
+    {
+        $this->size = max(1, (int)$size);
+    }
+    /**
+     * Adds or updates a key and makes it the most recently used entry
+     * @param mixed $key int or string key, as it indexes a PHP array
+     * @param mixed $value value to associate with $key
+     * @return mixed [key, value] of the evicted entry, or null if none
+     */
+    public function put($key, $value)
+    {
+        $evicted = null;
+        if (array_key_exists($key, $this->cache)) {
+            unset($this->cache[$key]);
+        } else if (count($this->cache) >= $this->size) {
+            $evicted_key = array_key_last($this->cache);
+            $evicted = [$evicted_key, $this->cache[$evicted_key]];
+            unset($this->cache[$evicted_key]);
+        }
+        $this->cache = [$key => $value] + $this->cache;
+        return $evicted;
+    }
+    /**
+     * Returns the value for a key and makes it the most recently used entry
+     * @param mixed $key key to look up
+     * @return mixed stored value, or null if $key is not in the cache
+     */
+    public function get($key)
+    {
+        if (array_key_exists($key, $this->cache)) {
+            $this->put($key, $this->cache[$key]);
+        }
+        return $this->cache[$key] ?? null;
+    }
+    /**
+     * Lets foreach walk a copy of the entries, most recently used first
+     */
+    public function getIterator(): \Traversable
+    {
+        return new \ArrayIterator($this->cache);
+    }
+}
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index aa982ced7..763a33d73 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2059,3 +2059,16 @@ function upgradeDatabaseVersion74(&$db)
"PAGE_ID $integer NOT NULL, RESOURCE_PATH VARCHAR(255), SCORE FLOAT, " .
"TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "), RESOURCE_ID $integer)");
}
+/**
+ * Upgrades a Version 74 version of the Yioop database to a Version 75 version
+ * by adding a (USER_ID, ITEM_TYPE) index to GROUP_ITEM_RECOMMENDATION and a
+ * USER_ID index to GROUP_RESOURCE_RECOMMENDATION
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion75(&$db)
+{
+    $db->execute("CREATE INDEX GI_RECOMMENDATION_INDEX ON " .
+        "GROUP_ITEM_RECOMMENDATION (USER_ID, ITEM_TYPE)");
+    $db->execute("CREATE INDEX GR_RECOMMENDATION_INDEX ON " .
+        "GROUP_RESOURCE_RECOMMENDATION (USER_ID)");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 8ff295a92..f772f88ac 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -34,6 +34,7 @@ namespace seekquarry\yioop\library\media_jobs;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\LinearAlgebra as LinearAlgebra;
+use seekquarry\yioop\library\LRUCache as LRUCache;
use seekquarry\yioop\library\PhraseParser as PhraseParser;
use seekquarry\yioop\models\CronModel;
@@ -64,6 +65,10 @@ class RecommendationJob extends MediaJob
* @var array
*/
public $user_idf;
+ /**
+ * LRUCache for term embeddings
+ */
+ public $lru_cache;
/**
* Number of inserts to try to group into a single insert statement
* before execution
@@ -102,6 +107,14 @@ class RecommendationJob extends MediaJob
const DESCRIPTION_STOP_WORDS = ["author", "authors", "plot", "genre",
"genres", "star", "stars", "credits", "rating", "ratings", "year",
"director", "cast", "runtime"];
+ /**
+ * Hash algorithm to be used for calculating sign in Hash2Vec term embedding
+ */
+ const SIGN_HASH_ALGORITHM = "crc32";
+ /**
+ * MAX term embeddings fetched from database to initialize LRUCache
+ */
+ const MAX_TERM_EMBEDDINGS = 1000;
/**
* Sets up the database connection so can access tables related
* to recommendations. Initialize timing info related to job.
@@ -116,6 +129,7 @@ class RecommendationJob extends MediaJob
$db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
$this->db = new $db_class();
$this->db->connect();
+ $this->size = self::EMBEDDING_VECTOR_SIZE;
}
/**
* Only update if its been more than an hour since the last update
@@ -263,12 +277,14 @@ class RecommendationJob extends MediaJob
public function computeThreadGroupRecommendations()
{
L\crawlLog("...Start computing Item Term Embeddings...");
- [$term_embeddings, $item_terms] = $this->computeItemTermEmbeddings();
+ $item_terms = $this->computeItemTermEmbeddings();
L\crawlLog("...Finished computing Item Term Embeddings...");
L\crawlLog("...Start computing Item Embeddings...");
- $item_embeddings = $this->computeItemEmbeddings(
- $term_embeddings, $item_terms);
+ $item_embeddings = $this->computeItemEmbeddings($item_terms);
L\crawlLog("...Finished computing Item Embeddings...");
+ L\crawlLog("...Start write back term embeddings from cache to db");
+ $this->saveTermEmbeddingsCacheToDb(C\THREAD_RECOMMENDATION);
+ L\crawlLog("...Finished write back term embeddings from cache to db");
L\crawlLog("...Start computing Item User Embeddings...");
[$item_user_embeddings, $user_items] = $this->
computeItemUserEmbeddings($item_embeddings);
@@ -277,9 +293,12 @@ class RecommendationJob extends MediaJob
$user_groups = $this->computeItemUserRecommendations($item_embeddings,
$item_user_embeddings, $user_items);
L\crawlLog("...Finished computing Item User Recommendations...");
+ unset($item_user_embeddings);
+ unset($user_items);
L\crawlLog("...Start computing Group Embeddings...");
$group_embeddings = $this->computeGroupEmbeddings($item_embeddings);
L\crawlLog("...Finished computing Group Embeddings...");
+ unset($item_embedding);
L\crawlLog("...Start computing Group User Embeddings...");
[$group_user_embeddings, $user_group_impression] =
$this->computeGroupUserEmbeddings($group_embeddings);
@@ -288,6 +307,10 @@ class RecommendationJob extends MediaJob
$this->computeGroupUserRecommendations($group_embeddings,
$group_user_embeddings, $user_groups, $user_group_impression);
L\crawlLog("...Finished computing Group User Recommendations...");
+ unset($group_embeddings);
+ unset($group_user_embeddings);
+ unset($user_group_impression);
+ unset($user_groups);
}
/**
* Computes the term embeddings for individual items (main thread only and
@@ -295,19 +318,19 @@ class RecommendationJob extends MediaJob
* description text. Processes only MAX_GROUP_ITEMS which are either newly
* created or recently edited
*
- * @return array [$term_embeddings, $item_terms] containing embeddings for
- * terms in the items and terms in each item
+ * @return array $item_terms terms in each item
*/
public function computeItemTermEmbeddings()
{
$db = $this->db;
+ $this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
$select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" .
- " ITEM_TYPE = ?";
+ " ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
$results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]);
$term_embeddings = [];
$item_terms = [];
while ($row = $db->fetchArray($results)) {
- $term_embeddings[$row['ID']] = unserialize($row['VECTOR']);
+ $this->lru_cache->put($row['ID'], unserialize($row['VECTOR']));
}
$context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
(self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
@@ -322,7 +345,6 @@ class RecommendationJob extends MediaJob
" AND TITLE NOT LIKE '%Page%' ORDER BY EDIT_DATE DESC " .
$db->limitOffset(self::MAX_GROUP_ITEMS);
$results = $db->execute($group_item_sql);
- $update_term_embeddings = [];
while ($row = $db->fetchArray($results)) {
$item_id = $row['ID'];
$text_corpus = $row['TITLE'] . " " . $row['DESCRIPTION'];
@@ -332,55 +354,39 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < count($terms); $i++) {
[$term_id, $term] = $terms[$i];
$term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE;
- if (!array_key_exists($term_id, $term_embeddings)) {
- $term_embeddings[$term_id] = array_fill(0,
- self::EMBEDDING_VECTOR_SIZE, 0);
- }
+ $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
+ $term_sign = unpack('n', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
+ $term_embedding = $this->getTermEmbedding($term_id,
+ C\THREAD_RECOMMENDATION);
+ $term_embedding = unpack("d$this->size", $term_embedding);
for ($j = $i - 1; $j >= 0 &&
$j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) {
[$context_term_id, $context_term] = $terms[$j];
+ $context_term_embedding = $this->getTermEmbedding(
+ $context_term_id, C\THREAD_RECOMMENDATION);
+ $context_term_embedding = unpack("d$this->size",
+ $context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
$context_term_hash = $context_term_id %
self::EMBEDDING_VECTOR_SIZE;
- $term_embeddings[$term_id][$context_term_hash] +=
- $weight;
- $term_embeddings[$context_term_id][$term_hash] +=
- $weight;
+ $context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
+ $context_term, true);
+ $context_term_sign = unpack('n', $context_term_sign_hash)[1]
+ % 2 == 0 ? -1 : 1;
+ $term_embedding[$context_term_hash] +=
+ $context_term_sign * $weight;
+ $context_term_embedding[$term_hash] += $term_sign * $weight;
+ $context_term_embedding = pack("d$this->size",
+ ...$context_term_embedding);
+ $this->updateTermEmbeddingCache($context_term_id,
+ $context_term_embedding, C\THREAD_RECOMMENDATION);
}
+ $term_embedding = pack("d$this->size", ...$term_embedding);
+ $this->updateTermEmbeddingCache($term_id, $term_embedding,
+ C\THREAD_RECOMMENDATION);
}
}
- $normalized_term_embeddings = [];
- foreach ($term_embeddings as $term_id => $embedding) {
- $normalized_term_embeddings[$term_id] =
- LinearAlgebra::normalize($embedding);
- }
- $delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING" .
- " WHERE ITEM_TYPE = ?";
- $db->execute($delete_sql, [C\THREAD_RECOMMENDATION]);
- $base_insert_sql = "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES ";
- $insert_sql = $base_insert_sql;
- $comma = "";
- $insert_count = 0;
- $item_type = C\THREAD_RECOMMENDATION;
- foreach ($normalized_term_embeddings as $term_id => $embedding) {
- $serialized_embedding = serialize($embedding);
- $insert_sql .= "$comma($term_id, $item_type," .
- " '$serialized_embedding')";
- $comma = ",";
- $insert_count++;
- if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
- $insert_sql = $db->insertIgnore($insert_sql);
- $db->execute($insert_sql);
- $insert_count = 0;
- $comma = "";
- $insert_sql = $base_insert_sql;
- }
- }
- if ($insert_count > 0) {
- $insert_sql = $db->insertIgnore($insert_sql);
- $db->execute($insert_sql);
- }
- return [$term_embeddings, $item_terms];
+ return $item_terms;
}
/**
* Computes the item embeddings for individual items (main thread only and
@@ -388,23 +394,24 @@ class RecommendationJob extends MediaJob
* Additionally fetches the existing item embeddings from database and
* updates them if the term embeddings are updated for their terms
*
- * @param array $term_embeddings embedding for the terms
* @param array $item_terms terms in each item
* @return array $updated_item_embeddings containing embeddings for items
*/
- public function computeItemEmbeddings($term_embeddings, $item_terms)
+ public function computeItemEmbeddings($item_terms)
{
$db = $this->db;
$updated_item_embeddings = [];
foreach ($item_terms as $item_id => [$terms, $group_id]) {
$item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
foreach ($terms as [$term_id, $term]) {
- if (array_key_exists($term_id, $term_embeddings)) {
- $item_embedding = LinearAlgebra::add($item_embedding,
- $term_embeddings[$term_id]);
- }
+ $term_embedding = $this->getTermEmbedding($term_id,
+ C\THREAD_RECOMMENDATION, true);
+ $term_embedding = unpack("d$this->size", $term_embedding);
+ $item_embedding = LinearAlgebra::add($item_embedding,
+ $term_embedding);
}
$item_embedding = LinearAlgebra::normalize($item_embedding);
+ $item_embedding = pack("d$this->size", ...$item_embedding);
$updated_item_embeddings[$item_id] = [$item_embedding, $group_id];
}
$base_delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING" .
@@ -417,9 +424,9 @@ class RecommendationJob extends MediaJob
$item_type = C\THREAD_RECOMMENDATION;
foreach ($updated_item_embeddings as
$item_id => [$embedding, $parent_id]) {
- $serialized_embedding = serialize($embedding);
- $insert_sql .= "$comma($item_id, $item_type," .
- " '$serialized_embedding', $parent_id)";
+ $embedding = serialize(unpack("d$this->size", $embedding));
+ $insert_sql .= "$comma($item_id, $item_type, " .
+ "'$embedding', $parent_id)";
$delete_sql .= "$comma $item_id";
$comma = ",";
$insert_count++;
@@ -476,14 +483,17 @@ class RecommendationJob extends MediaJob
$user_items[$user_id] = [];
foreach ($item_ids as $item_id) {
if (array_key_exists($item_id, $item_embeddings)) {
- $item_user_embeddings[$user_id] = LinearAlgebra::add(
- $item_user_embeddings[$user_id],
+ $item_embedding = unpack("d$this->size",
$item_embeddings[$item_id][0]);
+ $item_user_embeddings[$user_id] = LinearAlgebra::add(
+ $item_user_embeddings[$user_id], $item_embedding);
$user_items[$user_id][] = $item_id;
}
}
$item_user_embeddings[$user_id] = LinearAlgebra::normalize(
$item_user_embeddings[$user_id]);
+ $item_user_embeddings[$user_id] = pack("d$this->size",
+ ...$item_user_embeddings[$user_id]);
}
return [$item_user_embeddings, $user_items];
}
@@ -516,21 +526,15 @@ class RecommendationJob extends MediaJob
}
$item_user_recommendations = [];
foreach ($item_user_embeddings as $user_id => $embedding) {
+ $embedding = unpack("d$this->size", $embedding);
if (array_key_exists($user_id, $user_groups)) {
- $user_item = [];
- if (array_key_exists($user_id, $user_items)) {
- $user_item = $user_items[$user_id];
- }
- $user_group = [];
- if (array_key_exists($user_id, $user_groups)) {
- $user_group = $user_groups[$user_id];
- }
foreach ($item_embeddings as
$item_id => [$item_embedding, $parent_id]) {
- if (in_array($item_id, $user_item) ||
- !in_array($parent_id, $user_group)) {
+ if (in_array($item_id, $user_items[$user_id]) ||
+ !in_array($parent_id, $user_groups[$user_id])) {
continue;
}
+ $item_embedding = unpack("d$this->size", $item_embedding);
$similarity = LinearAlgebra::similarity(
$item_embedding, $embedding);
$item_user_recommendations[] = [$user_id,
@@ -547,8 +551,9 @@ class RecommendationJob extends MediaJob
$insert_count = 0;
$item_type = C\THREAD_RECOMMENDATION;
foreach ($item_user_recommendations as $recommendation) {
- $insert_sql .= "$comma({$recommendation[0]}, {$recommendation[1]}" .
- ", $item_type, {$recommendation[2]}, {$this->update_time})";
+ [$user_id, $item_id, $similarity] = $recommendation;
+ $insert_sql .= "$comma($user_id, $item_id" .
+ ", $item_type, $similarity, {$this->update_time})";
$comma = ",";
$insert_count++;
if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
@@ -579,15 +584,20 @@ class RecommendationJob extends MediaJob
$updated_group_embeddings = [];
foreach ($item_embeddings as $item_id => [$embedding, $parent_id]) {
if (array_key_exists($parent_id, $updated_group_embeddings)) {
- $updated_group_embeddings[$parent_id] = LinearAlgebra::add(
- $embedding, $updated_group_embeddings[$parent_id]);
+ $embedding = unpack("d$this->size", $embedding);
+ $group_embedding = unpack("d$this->size",
+ $updated_group_embeddings[$parent_id]);
+ $updated_group_embeddings[$parent_id] = pack("d$this->size",
+ ...LinearAlgebra::add($embedding, $group_embedding));
} else {
$updated_group_embeddings[$parent_id] = $embedding;
}
}
foreach ($updated_group_embeddings as $group_id => $embedding) {
+ $embedding = unpack("d$this->size", $embedding);
$embedding = LinearAlgebra::normalize($embedding);
- $updated_group_embeddings[$group_id] = $embedding;
+ $updated_group_embeddings[$group_id] = pack("d$this->size",
+ ...$embedding);
}
$base_delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING" .
" WHERE ITEM_TYPE = ? AND ID IN (";
@@ -598,9 +608,9 @@ class RecommendationJob extends MediaJob
$insert_count = 0;
$item_type = C\GROUP_RECOMMENDATION;
foreach ($updated_group_embeddings as $group_id => $embedding) {
- $serialized_embedding = serialize($embedding);
- $insert_sql .= "$comma($group_id, $item_type," .
- " '$serialized_embedding', $group_id)";
+ $embedding = serialize(unpack("d$this->size", $embedding));
+ $insert_sql .= "$comma($group_id, $item_type, " .
+ "'$embedding', $group_id)";
$delete_sql .= "$comma $group_id";
$comma = ",";
$insert_count++;
@@ -658,14 +668,15 @@ class RecommendationJob extends MediaJob
$user_groups[$user_id] = [];
foreach ($group_ids as $group_id) {
if (array_key_exists($group_id, $group_embeddings)) {
- $group_user_embeddings[$user_id] = LinearAlgebra::add(
- $group_user_embeddings[$user_id],
+ $embedding = unpack("d$this->size",
$group_embeddings[$group_id]);
+ $group_user_embeddings[$user_id] = LinearAlgebra::add(
+ $group_user_embeddings[$user_id], $embedding);
$user_groups[$user_id][] = $group_id;
}
}
- $group_user_embeddings[$user_id] = LinearAlgebra::normalize(
- $group_user_embeddings[$user_id]);
+ $group_user_embeddings[$user_id] = pack("d$this->size",
+ ...LinearAlgebra::normalize($group_user_embeddings[$user_id]));
}
return [$group_user_embeddings, $user_groups];
}
@@ -693,14 +704,14 @@ class RecommendationJob extends MediaJob
}
$group_user_recommendations = [];
foreach ($group_user_embeddings as $user_id => $embedding) {
- $user_group = $user_groups[$user_id];
- $impression_group = $user_group_impression[$user_id];
foreach ($group_embeddings as $group_id => $group_embedding) {
if (in_array($group_id, $exclude_group_ids) ||
- in_array($group_id, $user_group) ||
- in_array($group_id, $impression_group)) {
+ in_array($group_id, $user_groups[$user_id]) ||
+ in_array($group_id, $user_group_impression[$user_id])) {
continue;
}
+ $embedding = unpack("d$this->size", $embedding);
+ $group_embedding = unpack("d$this->size", $group_embedding);
$similarity = LinearAlgebra::similarity($embedding,
$group_embedding);
$group_user_recommendations[] = [$user_id, $group_id,
@@ -716,8 +727,9 @@ class RecommendationJob extends MediaJob
$insert_count = 0;
$item_type = C\GROUP_RECOMMENDATION;
foreach ($group_user_recommendations as $recommendation) {
- $insert_sql .= "$comma({$recommendation[0]}, {$recommendation[1]}" .
- ", $item_type, {$recommendation[2]}, {$this->update_time})";
+ [$user_id, $group_id, $similarity] = $recommendation;
+ $insert_sql .= "$comma($user_id, $group_id" .
+ ", $item_type, $similarity, {$this->update_time})";
$comma = ",";
$insert_count++;
if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
@@ -747,13 +759,18 @@ class RecommendationJob extends MediaJob
L\crawlLog("...Finished fetching descriptions for the wiki page " .
"resources...");
L\crawlLog("...Start computing wiki term embeddings...");
- [$term_embeddings, $resource_terms, $meta_details_terms] =
+ [$resource_terms, $meta_details_terms] =
$this->computeWikiTermEmbeddings($descriptions);
L\crawlLog("...Finished computing wiki term embeddings...");
L\crawlLog("...Start computing wiki resource embeddings...");
$item_embeddings = $this->computeWikiResourceEmbeddings($resource_terms,
- $meta_details_terms, $term_embeddings);
+ $meta_details_terms);
L\crawlLog("...Finished computing wiki resource embeddings...");
+ unset($resource_terms);
+ unset($meta_details_terms);
+ L\crawlLog("...Start write back term embeddings from cache to db");
+ $this->saveTermEmbeddingsCacheToDb(C\RESOURCE_RECOMMENDATION);
+ L\crawlLog("...Finished write back term embeddings from cache to db");
L\crawlLog("...Start computing wiki user embeddings...");
[$user_embeddings, $user_items] = $this->computeWikiUserEmbeddings(
$item_embeddings);
@@ -762,6 +779,10 @@ class RecommendationJob extends MediaJob
$this->computeWikiUserRecommendations($item_embeddings,
$user_embeddings, $user_items, $resource_metadata);
L\crawlLog("...Done computing wiki resource recommendations...");
+ unset($user_embeddings);
+ unset($user_items);
+ unset($item_embeddings);
+ unset($resource_metadata);
}
/**
* Fetches the description for the eligible wiki resources having the root
@@ -839,22 +860,19 @@ class RecommendationJob extends MediaJob
* approach
*
* @param array $descriptions of resources
- * @return array [$term_embeddings, $resource_terms, $meta_details_term]
- * first with key being term id and value is the embedding vector for that
- * term, second with key being resource id and value being array of clean
- * terms in that resource description
+ * @return array [$resource_terms, $meta_details_term]
*/
public function computeWikiTermEmbeddings($descriptions)
{
$db = $this->db;
+ $this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
$select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" .
- " ITEM_TYPE = ?";
+ " ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
$results = $db->execute($select_sql, [C\RESOURCE_RECOMMENDATION]);
- $term_embeddings = [];
$resource_terms = [];
$meta_details_terms = [];
while ($row = $db->fetchArray($results)) {
- $term_embeddings[$row['ID']] = unserialize($row['VECTOR']);
+ $this->lru_cache->put($row['ID'], unserialize($row['VECTOR']));
}
$context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
(self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
@@ -885,51 +903,41 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < count($terms); $i++) {
[$term_id, $term] = $terms[$i];
$term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE;
- if (!array_key_exists($term_id, $term_embeddings)) {
- $term_embeddings[$term_id] = array_fill(0,
- self::EMBEDDING_VECTOR_SIZE, 0);
- }
+ $term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
+ $term, true);
+ $term_sign = unpack('n', $term_sign_hash)[1]
+ % 2 == 0 ? -1 : 1;
+ $term_embedding = $this->getTermEmbedding($term_id,
+ C\RESOURCE_RECOMMENDATION);
+ $term_embedding = unpack("d$this->size", $term_embedding);
for ($j = $i - 1; $j >= 0 &&
$j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) {
[$context_term_id, $context_term] = $terms[$j];
+ $context_term_embedding = $this->getTermEmbedding(
+ $context_term_id, C\RESOURCE_RECOMMENDATION);
+ $context_term_embedding = unpack("d$this->size",
+ $context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
$context_term_hash = $context_term_id %
self::EMBEDDING_VECTOR_SIZE;
- $term_embeddings[$term_id][$context_term_hash] +=
- $weight;
- $term_embeddings[$context_term_id][$term_hash] +=
- $weight;
+ $context_term_sign_hash = hash(
+ self::SIGN_HASH_ALGORITHM, $context_term, true);
+ $context_term_sign = unpack('n',
+ $context_term_sign_hash)[1] % 2 == 0 ? -1 : 1;
+ $term_embedding[$context_term_hash] +=
+ $context_term_sign * $weight;
+ $context_term_embedding = pack("d$this->size",
+ ...$context_term_embedding);
+ $this->updateTermEmbeddingCache($context_term_id,
+ $context_term_embedding, C\RESOURCE_RECOMMENDATION);
}
+ $term_embedding = pack("d$this->size", ...$term_embedding);
+ $this->updateTermEmbeddingCache($term_id, $term_embedding,
+ C\RESOURCE_RECOMMENDATION);
}
}
}
- $delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING WHERE" .
- " ITEM_TYPE = ?";
- $db->execute($delete_sql, [C\RESOURCE_RECOMMENDATION]);
- $base_insert_sql = "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES ";
- $insert_sql = $base_insert_sql;
- $comma = "";
- $insert_count = 0;
- $item_type = C\RESOURCE_RECOMMENDATION;
- foreach ($term_embeddings as $term_id => $embedding) {
- $serialized_embedding = serialize($embedding);
- $insert_sql .= "$comma($term_id, $item_type, " .
- "'$serialized_embedding')";
- $comma = ",";
- $insert_count++;
- if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
- $insert_sql = $db->insertIgnore($insert_sql);
- $db->execute($insert_sql);
- $insert_count = 0;
- $comma = "";
- $insert_sql = $base_insert_sql;
- }
- }
- if ($insert_count > 0) {
- $insert_sql = $db->insertIgnore($insert_sql);
- $db->execute($insert_sql);
- }
- return [$term_embeddings, $resource_terms, $meta_details_terms];
+ return [$resource_terms, $meta_details_terms];
}
/**
* Split the given text into terms, clean the terms by removing non
@@ -977,40 +985,49 @@ class RecommendationJob extends MediaJob
*
* @param array $resource_terms of processed terms from resource description
* @param array $meta_details_terms of raw resource descriptions
- * @param array $term_embeddings of term embeddings
* @return array $updated_item_embeddings array of updated wiki resource
* embeddings
*/
public function computeWikiResourceEmbeddings($resource_terms,
- $meta_details_terms, $term_embeddings)
+ $meta_details_terms)
{
$db = $this->db;
$updated_item_embeddings = [];
foreach ($resource_terms as $resource_id => $terms) {
$item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
foreach ($terms as [$term_id, $term]) {
- if (array_key_exists($term_id, $term_embeddings)) {
- $item_embedding = LinearAlgebra::add($item_embedding,
- $term_embeddings[$term_id]);
- }
+ $term_embedding = $this->getTermEmbedding($term_id,
+ C\RESOURCE_RECOMMENDATION, true);
+ $term_embedding = unpack("d$this->size", $term_embedding);
+ $item_embedding = LinearAlgebra::add($item_embedding,
+ $term_embedding);
}
- $updated_item_embeddings[$resource_id] = $item_embedding;
+ $updated_item_embeddings[$resource_id] = pack("d$this->size",
+ ...$item_embedding);
}
foreach ($meta_details_terms as $resource_id => $meta_terms) {
if (!array_key_exists($resource_id, $updated_item_embeddings)) {
- $updated_item_embeddings[$resource_id] = array_fill(0,
- self::EMBEDDING_VECTOR_SIZE, 0);
+ $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0);
+ } else {
+ $item_embedding = unpack("d$this->size",
+ $updated_item_embeddings[$resource_id]);
}
foreach ($meta_terms as [$meta_term_id, $meta_term]) {
if (strlen($meta_term) <= 1) {
continue;
}
- $updated_item_embeddings[$resource_id][$meta_term_id] += 1.;
+ $sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
+ $sign = unpack('n', $sign_hash)[1] % 2 == 0 ? -1 : 1;
+ $item_embedding[$meta_term_id%self::EMBEDDING_VECTOR_SIZE] +=
+ $sign * 1.0;
}
+ $updated_item_embeddings[$resource_id] = pack("d$this->size",
+ ...$item_embedding);
}
foreach ($updated_item_embeddings as $item_id => $embedding) {
- $updated_item_embeddings[$item_id] = LinearAlgebra::normalize(
- $embedding);
+ $embedding = unpack("d$this->size", $embedding);
+ $updated_item_embeddings[$item_id] = pack("d$this->size",
+ ...LinearAlgebra::normalize($embedding));
}
$delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING WHERE" .
" ITEM_TYPE = ?";
@@ -1021,9 +1038,9 @@ class RecommendationJob extends MediaJob
$insert_count = 0;
$item_type = C\RESOURCE_RECOMMENDATION;
foreach ($updated_item_embeddings as $resource_id => $embedding) {
- $serialized_embedding = serialize($embedding);
+ $embedding = serialize(unpack("d$this->size", $embedding));
$insert_sql .= "$comma($resource_id, $item_type," .
- " '$serialized_embedding', $resource_id)";
+ " '$embedding', $resource_id)";
$comma = ",";
$insert_count++;
if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
@@ -1075,13 +1092,16 @@ class RecommendationJob extends MediaJob
$user_items[$user_id] = [];
foreach ($item_ids as $item_id) {
if (array_key_exists($item_id, $item_embeddings)) {
+ $embedding = unpack("d$this->size",
+ $item_embeddings[$item_id]);
$user_embeddings[$user_id] = LinearAlgebra::add(
- $user_embeddings[$user_id], $item_embeddings[$item_id]);
+ $user_embeddings[$user_id], $embedding);
$user_items[$user_id][] = $item_id;
}
}
- $user_embeddings[$user_id] = LinearAlgebra::normalize(
- $user_embeddings[$user_id]);
+ $user_embeddings[$user_id] = pack("d$this->size",
+ ...LinearAlgebra::normalize(
+ $user_embeddings[$user_id]));
}
return [$user_embeddings, $user_items];
}
@@ -1100,20 +1120,18 @@ class RecommendationJob extends MediaJob
$db = $this->db;
$recommendations = [];
foreach ($user_embeddings as $user_id => $user_embedding) {
- if (array_key_exists($user_id, $user_items)) {
- $user_item = $user_items[$user_id];
- } else {
- $user_item = [];
- }
+ $user_embedding = unpack("d$this->size", $user_embedding);
foreach ($item_embeddings as $item_id => $item_embedding) {
- if (in_array($item_id, $user_item) ||
+ if (in_array($item_id, $user_items[$user_id]) ||
!array_key_exists($item_id, $resource_metadata)) {
continue;
}
+ $item_embedding = unpack("d$this->size", $item_embedding);
$similarity = LinearAlgebra::similarity($user_embedding,
$item_embedding);
list($group_id, $page_id, $resource_path) =
$resource_metadata[$item_id];
+ unset($resource_metadata[$item_id]);
$recommendations[] = [$user_id, $group_id, $page_id,
$resource_path, $similarity, $item_id];
}
@@ -1146,4 +1164,95 @@ class RecommendationJob extends MediaJob
$db->execute($insert_sql);
}
}
+    /**
+     * Returns the term embedding either from LRU cache or database, always
+     * as a pack("d...") string: array vectors are packed, unknown terms are 0
+     * @param int $term_id id of the term to look up
+     * @param int $item_type ITEM_TYPE value the term is stored under
+     * @param boolean $update indicates whether to update the cache
+     * @return string $term_embedding $this->size packed doubles
+     */
+    public function getTermEmbedding($term_id, $item_type, $update = false)
+    {
+        $term_embedding = $this->lru_cache->get($term_id);
+        if (!isset($term_embedding)) {
+            $sql = "SELECT VECTOR FROM RECOMMENDATION_TERM_EMBEDDING " .
+                "WHERE ITEM_TYPE = ? AND ID = ?";
+            $result = $this->db->execute($sql, [$item_type, $term_id]);
+            $row = $this->db->fetchArray($result);
+            $term_embedding = $row ? unserialize($row['VECTOR']) : [];
+        }
+        if (!is_string($term_embedding)) {
+            $values = is_array($term_embedding) ? $term_embedding : [];
+            $term_embedding = pack("d$this->size", ...array_pad(
+                array_slice(array_values($values), 0, $this->size),
+                $this->size, 0.0));
+        }
+        if ($update) {
+            $this->updateTermEmbeddingCache($term_id, $term_embedding,
+                $item_type);
+        }
+        return $term_embedding;
+    }
+    /**
+     * Stores a term embedding in the LRU cache; if that evicts another entry,
+     * the evicted entry's RECOMMENDATION_TERM_EMBEDDING row is rewritten
+     *
+     * @param int $term_id id of the term being cached
+     * @param string $term_embedding embedding to cache for the term
+     * @param int $item_type ITEM_TYPE value used for the evicted row
+     */
+    public function updateTermEmbeddingCache($term_id, $term_embedding,
+        $item_type)
+    {
+        $evicted = $this->lru_cache->put($term_id, $term_embedding);
+        if ($evicted === null) {
+            return;
+        }
+        [$old_id, $old_embedding] = $evicted;
+        $this->db->execute("DELETE FROM RECOMMENDATION_TERM_EMBEDDING " .
+            "WHERE ITEM_TYPE = ? AND ID = ?", [$item_type, $old_id]);
+        $this->db->execute("INSERT INTO RECOMMENDATION_TERM_EMBEDDING " .
+            "VALUES (?, ?, ?)", [$old_id, $item_type,
+            serialize($old_embedding)]);
+    }
+    /**
+     * Writes the cached term embeddings back to the database and frees the
+     * cache. Existing rows for the cached ids are deleted in batches of
+     * BATCH_SQL_INSERT_NUM and each entry is then reinserted using bound
+     * parameters, the same way updateTermEmbeddingCache() stores an evicted
+     * entry. A serialized packed embedding is binary data, so it must not
+     * be spliced into the SQL text.
+     * Nothing is written when the cache yields no entries.
+     *
+     * @param int $item_type value for ITEM_TYPE column
+     */
+    public function saveTermEmbeddingsCacheToDb($item_type)
+    {
+        $db = $this->db;
+        $delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING" .
+            " WHERE ITEM_TYPE = ? AND ID IN (";
+        // the same single-row insert is reused for every cached entry
+        $insert_sql = $db->insertIgnore(
+            "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES (?, ?, ?)");
+        // foreach only yields the cached entries if the cache object is
+        // Traversable; otherwise it walks the object's public properties
+        $rows = [];
+        foreach ($this->lru_cache as $id => $embedding) {
+            $rows[$id] = serialize($embedding);
+        }
+        // preserve_keys = true keeps the term ids as the keys of each batch
+        $batches = array_chunk($rows, self::BATCH_SQL_INSERT_NUM, true);
+        foreach ($batches as $batch) {
+            // ids are spliced in like the other batched deletes of this class
+            $ids = implode(", ", array_keys($batch));
+            $db->execute($delete_sql . $ids . ")", [$item_type]);
+            // bound parameters keep the binary embedding out of the SQL text
+            foreach ($batch as $id => $embedding) {
+                $db->execute($insert_sql, [$id, $item_type, $embedding]);
+            }
+        }
+        // drop the cache so the memory held by its entries can be freed
+        unset($this->lru_cache);
+    }
}
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 2b0f0d3ce..55a320906 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -215,6 +215,8 @@ class ProfileModel extends Model
GROUP_ITEM_RECOMMENDATION (USER_ID $integer NOT NULL,
ITEM_ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL,
SCORE FLOAT, TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "))",
+ "GI_RECOMMENDATION_INDEX" => "CREATE INDEX GI_RECOMMENDATION_INDEX
+ ON GROUP_ITEM_RECOMMENDATION (USER_ID, ITEM_TYPE)",
"GROUP_PAGE" => "CREATE TABLE GROUP_PAGE (
ID $serial PRIMARY KEY $auto_increment, GROUP_ID $integer,
DISCUSS_THREAD $integer, TITLE VARCHAR(" . C\TITLE_LEN . "),
@@ -247,6 +249,8 @@ class ProfileModel extends Model
GROUP_ID $integer NOT NULL, PAGE_ID $integer NOT NULL,
RESOURCE_PATH VARCHAR(255), SCORE FLOAT, TIMESTAMP NUMERIC(" .
C\TIMESTAMP_LEN . "), RESOURCE_ID $integer)",
+ "GR_RECOMMENDATION_INDEX" => "CREATE INDEX GR_RECOMMENDATION_INDEX
+ ON GROUP_RESOURCE_RECOMMENDATION (USER_ID)",
"SOCIAL_GROUPS" => "CREATE TABLE SOCIAL_GROUPS (
GROUP_ID $serial PRIMARY KEY $auto_increment,
GROUP_NAME VARCHAR(" . C\SHORT_TITLE_LEN