diff --git a/src/configs/Config.php b/src/configs/Config.php
index 662b6ae93..7aa3ba2ba 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
* Version number for upgrade database function
* @var int
*/
-nsdefine('DATABASE_VERSION', 76);
+nsdefine('DATABASE_VERSION', 77);
/**
* Minimum Version fo Yioop for which keyword ad script
* still works with this version
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index dac1149f4..c66ef3531 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2073,7 +2073,7 @@ function upgradeDatabaseVersion75(&$db)
"GROUP_RESOURCE_RECOMMENDATION (USER_ID)");
}
/**
- * Upgrades a Version 76 version of the Yioop database to a Version 75 version
+ * Upgrades a Version 75 version of the Yioop database to a Version 76 version
* @param object $db datasource to use to upgrade
*/
function upgradeDatabaseVersion76(&$db)
@@ -2092,3 +2092,17 @@ function upgradeDatabaseVersion76(&$db)
32 * C\EMBEDDING_VECTOR_SIZE . "), " .
"PARENT_ID $integer, PRIMARY KEY(ID, ITEM_TYPE))");
}
+/**
+ * Upgrades Yioop database Version 76 to 77: term embedding ID becomes VARCHAR
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion77(&$db)
+{
+ $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+ "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+ $integer = $db->integerType($dbinfo);
+ $db->execute("DROP TABLE RECOMMENDATION_TERM_EMBEDDING"); // discards rows
+ $db->execute("CREATE TABLE RECOMMENDATION_TERM_EMBEDDING " .
+ "(ID VARCHAR(16) NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR("
+ . 32 * C\EMBEDDING_VECTOR_SIZE . "), PRIMARY KEY(ID, ITEM_TYPE))");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 920442ca2..061052364 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -98,6 +98,10 @@ class RecommendationJob extends MediaJob
const DESCRIPTION_STOP_WORDS = ["author", "authors", "plot", "genre",
"genres", "star", "stars", "credits", "rating", "ratings", "year",
"director", "cast", "runtime"];
+ /**
+ * Hash algorithm used to pick a term's index in its Hash2Vec embedding
+ */
+ const HASH_ALGORITHM = "md5";
/**
* Hash algorithm to be used for calculating sign in Hash2Vec term embedding
*/
@@ -320,7 +324,7 @@ class RecommendationJob extends MediaJob
$db = $this->db;
$this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
$select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" .
- " ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
+ " ITEM_TYPE = ? " . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
$results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]);
$term_embeddings = [];
$item_terms = [];
@@ -359,7 +363,8 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < $num_terms; $i++) {
L\crawlTimeoutLog("Have processed $i of $num_terms terms");
[$term_id, $term] = $terms[$i];
- $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
+ $term_hash = unpack('N', hash(self::HASH_ALGORITHM, $term, true)
+ )[1] % C\EMBEDDING_VECTOR_SIZE + 1;
$term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
$term_sign = unpack('N', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
$term_embedding = $this->getTermEmbedding($term_id,
@@ -373,8 +378,8 @@ class RecommendationJob extends MediaJob
$context_term_embedding = unpack("E*",
$context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
- $context_term_hash = $context_term_id %
- C\EMBEDDING_VECTOR_SIZE + 1;
+ $context_term_hash = unpack('N', hash(self::HASH_ALGORITHM,
+ $context_term, true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
$context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
$context_term, true);
$context_term_sign = unpack('N', $context_term_sign_hash)[1]
@@ -932,7 +937,7 @@ class RecommendationJob extends MediaJob
$db = $this->db;
$this->lru_cache = new LRUCache(self::MAX_TERM_EMBEDDINGS);
$select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE " .
- "ITEM_TYPE = ?" . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
+ "ITEM_TYPE = ? " . $db->limitOffset(self::MAX_TERM_EMBEDDINGS);
$results = $db->execute($select_sql, [C\RESOURCE_RECOMMENDATION]);
$resource_terms = [];
$meta_details_terms = [];
@@ -976,7 +981,8 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < $num_terms; $i++) {
L\crawlTimeoutLog("Have processed $i of $num_terms terms");
[$term_id, $term] = $terms[$i];
- $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
+ $term_hash = unpack('N', hash(self::HASH_ALGORITHM, $term,
+ true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
$term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
$term, true);
$term_sign = unpack('N', $term_sign_hash)[1]
@@ -992,7 +998,8 @@ class RecommendationJob extends MediaJob
$context_term_embedding = unpack("E*",
$context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
- $context_term_hash = $context_term_id %
+ $context_term_hash = unpack('N', hash(
+ self::HASH_ALGORITHM, $context_term, true))[1] %
C\EMBEDDING_VECTOR_SIZE + 1;
$context_term_sign_hash = hash(
self::SIGN_HASH_ALGORITHM, $context_term, true);
@@ -1051,7 +1058,7 @@ class RecommendationJob extends MediaJob
in_array($term, self::DESCRIPTION_STOP_WORDS)) {
continue;
}
- $term_id = unpack('N', md5($term, true))[1];
+ $term_id = L\canonicalTerm($term);
$term_ids[] = [$term_id, $term];
}
return $term_ids;
@@ -1096,10 +1103,11 @@ class RecommendationJob extends MediaJob
if (strlen($meta_term) <= 1) {
continue;
}
+ $meta_term_hash = unpack('N', hash(self::HASH_ALGORITHM,
+ $meta_term, true))[1] % C\EMBEDDING_VECTOR_SIZE + 1;
$sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
$sign = unpack('N', $sign_hash)[1] % 2 == 0 ? -1 : 1;
- $item_embedding[$meta_term_id % C\EMBEDDING_VECTOR_SIZE + 1]
- += $sign * 1.0;
+ $item_embedding[$meta_term_hash] += $sign * 1.0;
}
$updated_item_embeddings[$resource_id] = pack("E*",
...$item_embedding);
@@ -1337,8 +1345,8 @@ class RecommendationJob extends MediaJob
foreach ($this->lru_cache->getAll() as $id => $embedding) {
L\crawlTimeoutLog("Have inserted $total_insert many embeddings");
$embedding = base64_encode($embedding);
- $insert_sql .= "$comma($id, $item_type, '$embedding')";
- $delete_sql .= "$comma $id";
+ $insert_sql .= "$comma('$id', $item_type, '$embedding')";
+ $delete_sql .= "$comma '$id'";
$comma = ",";
$insert_count++;
$total_insert++;
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 07f0466e2..bceef3883 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -335,7 +335,7 @@ class ProfileModel extends Model
32 * C\EMBEDDING_VECTOR_SIZE . "), PARENT_ID $integer,
PRIMARY KEY(ID, ITEM_TYPE))",
"RECOMMENDATION_TERM_EMBEDDING" => "CREATE TABLE
- RECOMMENDATION_TERM_EMBEDDING (ID $integer NOT NULL,
+ RECOMMENDATION_TERM_EMBEDDING (ID VARCHAR(16) NOT NULL,
ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
32 * C\EMBEDDING_VECTOR_SIZE . "),
PRIMARY KEY(ID, ITEM_TYPE))",
diff --git a/tests/ManyUserExperiment.php b/tests/ManyUserExperiment.php
index 5fcb5b604..a9dd6819c 100644
--- a/tests/ManyUserExperiment.php
+++ b/tests/ManyUserExperiment.php
@@ -271,7 +271,7 @@ $web_series = [
"Jhansi.mp4",
"Dubai Bling.mp4",
"The Mysterious Benedict Society Season 2.mp4",
- "Sumo Do, Sumo Don't!.mp4",
+ "Sumo Do, Sumo Dont!.mp4",
"Star Wars Tales of the Jedi.mp4",
"Shadow Detective.mp4",
"Tripling Season 3.mp4",
@@ -322,7 +322,7 @@ $books = [
"Love on the Brain.pdf",
"Only a Monster.pdf",
"This Woven Kingdom.pdf",
- "Delilah Green Doesn't Care.pdf",
+ "Delilah Green Doesnt Care.pdf",
"Portrait of a Thief.pdf",
"The Paris Apartment.pdf",
"Foul Lady Fortune.pdf",