diff --git a/src/configs/Config.php b/src/configs/Config.php
index 306330d75..19280d973 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
* Version number for upgrade database function
* @var int
*/
-nsdefine('DATABASE_VERSION', 75);
+nsdefine('DATABASE_VERSION', 76);
/**
* Minimum Version fo Yioop for which keyword ad script
* still works with this version
@@ -1264,6 +1264,8 @@ nsdefine('LONG_NAME_LEN', 64);
nsdefine('SHORT_TITLE_LEN', 128);
/** Length for names of things like titles of blog entries, etc */
nsdefine('TITLE_LEN', 512);
+/** Number of components in a term or item embedding */
+nsdefine('EMBEDDING_VECTOR_SIZE', 200);
/** Length of a feed item or post, etc */
nsdefine('MAX_GROUP_POST_LEN', 8192);
/** Length for for the contents of a wiki_page */
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 763a33d73..dac1149f4 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2072,3 +2072,23 @@ function upgradeDatabaseVersion75(&$db)
$db->execute("CREATE INDEX GR_RECOMMENDATION_INDEX ON " .
"GROUP_RESOURCE_RECOMMENDATION (USER_ID)");
}
+/**
+ * Upgrades a Version 75 version of the Yioop database to a Version 76
+ * version. Drops both RECOMMENDATION_*_EMBEDDING tables (stored rows are
+ * lost) and recreates them with VECTOR as a VARCHAR column.
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion76(&$db)
+{
+    $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+        "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+    $integer = $db->integerType($dbinfo);
+    $columns = "(ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, " .
+        "VECTOR VARCHAR(" . 32 * C\EMBEDDING_VECTOR_SIZE . "), ";
+    $db->execute("DROP TABLE RECOMMENDATION_TERM_EMBEDDING");
+    $db->execute("DROP TABLE RECOMMENDATION_ITEM_EMBEDDING");
+    $db->execute("CREATE TABLE RECOMMENDATION_TERM_EMBEDDING $columns" .
+        "PRIMARY KEY(ID, ITEM_TYPE))");
+    $db->execute("CREATE TABLE RECOMMENDATION_ITEM_EMBEDDING $columns" .
+        "PARENT_ID $integer, PRIMARY KEY(ID, ITEM_TYPE))");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 1a67be9ab..74b74139b 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -91,10 +91,6 @@ class RecommendationJob extends MediaJob
* Length of context window for calculating term embeddings
*/
const CONTEXT_WINDOW_LENGTH = 5;
- /**
- * Size of term and item embedding vector
- */
- const EMBEDDING_VECTOR_SIZE = 200;
/**
* Update period to consider for fetching the records from
* ITEM_IMPRESSION_SUMMARY table
@@ -133,7 +129,7 @@ class RecommendationJob extends MediaJob
$db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
$this->db = new $db_class();
$this->db->connect();
- $this->size = self::EMBEDDING_VECTOR_SIZE;
+ $this->size = C\EMBEDDING_VECTOR_SIZE;
}
/**
* Only update if its been more than an hour since the last update
@@ -256,7 +252,7 @@ class RecommendationJob extends MediaJob
$insert_recommend_sql .=
"$comma ($user_id, {$recommendation['GROUP_ID']}, ".
"{$recommendation['PAGE_ID']}, " .
- "\'{$recommendation['RESOURCE_PATH']}\', ".
+ "'{$recommendation['RESOURCE_PATH']}', ".
"{$recommendation['TOTAL_SCORE']}, {$this->update_time}, ".
"{$recommendation['RESOURCE_ID']})";
$comma = ",";
@@ -333,6 +329,7 @@ class RecommendationJob extends MediaJob
$results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]);
$term_embeddings = [];
$item_terms = [];
+ L\crawlLog("Start Populating LRUCache of Embeddings...");
while ($row = $db->fetchArray($results)) {
if (is_string($row['VECTOR'])) {
$this->lru_cache->put($row['ID'],
@@ -341,6 +338,7 @@ class RecommendationJob extends MediaJob
var_dump($row['VECTOR']);
}
}
+ L\crawlLog("Finish Populating LRUCache of Embeddings");
$context_distance_sum = (self::CONTEXT_WINDOW_LENGTH *
(self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0;
$mean = $context_distance_sum / self::CONTEXT_WINDOW_LENGTH;
@@ -366,7 +364,7 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < $num_terms; $i++) {
L\crawlTimeoutLog("Have processed $i of $num_terms terms");
[$term_id, $term] = $terms[$i];
- $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
+ $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
$term_sign_hash = hash(self::SIGN_HASH_ALGORITHM, $term, true);
$term_sign = unpack('N', $term_sign_hash)[1] % 2 == 0 ? -1 : 1;
$term_embedding = $this->getTermEmbedding($term_id,
@@ -381,7 +379,7 @@ class RecommendationJob extends MediaJob
$context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
$context_term_hash = $context_term_id %
- self::EMBEDDING_VECTOR_SIZE + 1;
+ C\EMBEDDING_VECTOR_SIZE + 1;
$context_term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
$context_term, true);
$context_term_sign = unpack('N', $context_term_sign_hash)[1]
@@ -418,7 +416,7 @@ class RecommendationJob extends MediaJob
$item_count = 0;
foreach ($item_terms as $item_id => [$terms, $group_id]) {
L\crawlTimeoutLog("Have done $item_count many group items");
- $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+ $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
foreach ($terms as [$term_id, $term]) {
$term_embedding = $this->getTermEmbedding($term_id,
C\THREAD_RECOMMENDATION, true);
@@ -503,7 +501,7 @@ class RecommendationJob extends MediaJob
$item_ids = explode(",", $row['ITEM_IDS']);
$item_ids = array_unique($item_ids);
$item_user_embeddings[$user_id] = array_fill(1,
- self::EMBEDDING_VECTOR_SIZE, 0);
+ C\EMBEDDING_VECTOR_SIZE, 0);
$user_items[$user_id] = [];
foreach ($item_ids as $item_id) {
if (array_key_exists($item_id, $item_embeddings)) {
@@ -710,7 +708,7 @@ class RecommendationJob extends MediaJob
$group_ids = explode(",", $row['ITEM_IDS']);
$group_ids = array_unique($group_ids);
$group_user_embeddings[$user_id] = array_fill(1,
- self::EMBEDDING_VECTOR_SIZE, 0);
+ C\EMBEDDING_VECTOR_SIZE, 0);
$user_groups[$user_id] = [];
$group_count = 0;
foreach ($group_ids as $group_id) {
@@ -982,7 +980,7 @@ class RecommendationJob extends MediaJob
for ($i = 0; $i < $num_terms; $i++) {
L\crawlTimeoutLog("Have processed $i of $num_terms terms");
[$term_id, $term] = $terms[$i];
- $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE + 1;
+ $term_hash = $term_id % C\EMBEDDING_VECTOR_SIZE + 1;
$term_sign_hash = hash(self::SIGN_HASH_ALGORITHM,
$term, true);
$term_sign = unpack('N', $term_sign_hash)[1]
@@ -999,7 +997,7 @@ class RecommendationJob extends MediaJob
$context_term_embedding);
$weight = exp(-1 * pow(($i - $j) / $std_deviation, 2));
$context_term_hash = $context_term_id %
- self::EMBEDDING_VECTOR_SIZE + 1;
+ C\EMBEDDING_VECTOR_SIZE + 1;
$context_term_sign_hash = hash(
self::SIGN_HASH_ALGORITHM, $context_term, true);
$context_term_sign = unpack('N',
@@ -1079,7 +1077,7 @@ class RecommendationJob extends MediaJob
$resource_count = 0;
foreach ($resource_terms as $resource_id => $terms) {
L\crawlTimeoutLog("Have processed $resource_count many resources");
- $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+ $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
foreach ($terms as [$term_id, $term]) {
$term_embedding = $this->getTermEmbedding($term_id,
C\RESOURCE_RECOMMENDATION, true);
@@ -1093,7 +1091,7 @@ class RecommendationJob extends MediaJob
}
foreach ($meta_details_terms as $resource_id => $meta_terms) {
if (!array_key_exists($resource_id, $updated_item_embeddings)) {
- $item_embedding = array_fill(1, self::EMBEDDING_VECTOR_SIZE, 0);
+ $item_embedding = array_fill(1, C\EMBEDDING_VECTOR_SIZE, 0);
} else {
$item_embedding = unpack("E*",
$updated_item_embeddings[$resource_id]);
@@ -1104,7 +1102,7 @@ class RecommendationJob extends MediaJob
}
$sign_hash = hash(self::SIGN_HASH_ALGORITHM, $meta_term, true);
$sign = unpack('N', $sign_hash)[1] % 2 == 0 ? -1 : 1;
- $item_embedding[$meta_term_id%self::EMBEDDING_VECTOR_SIZE + 1]
+ $item_embedding[$meta_term_id % C\EMBEDDING_VECTOR_SIZE + 1]
+= $sign * 1.0;
}
$updated_item_embeddings[$resource_id] = pack("E*",
@@ -1181,7 +1179,7 @@ class RecommendationJob extends MediaJob
$item_ids = explode(",", $row['ITEM_IDS']);
$item_ids = array_unique($item_ids);
$user_embeddings[$user_id] = array_fill(1,
- self::EMBEDDING_VECTOR_SIZE, 0);
+ C\EMBEDDING_VECTOR_SIZE, 0);
$user_items[$user_id] = [];
foreach ($item_ids as $item_id) {
if (array_key_exists($item_id, $item_embeddings)) {
@@ -1249,7 +1247,7 @@ class RecommendationJob extends MediaJob
$score, $item_id) = $recommendation;
$time = $this->update_time;
$insert_sql .= "$comma($user_id, $group_id, $page_id, " .
- "\'$resource_path\', $score, $time, $item_id)";
+ "'$resource_path', $score, $time, $item_id)";
$comma = ",";
$insert_count++;
$total_insert++;
@@ -1285,7 +1283,7 @@ class RecommendationJob extends MediaJob
$row = $db->fetchArray($result);
if (!$row || !is_string($row['VECTOR'])) {
$term_embedding = pack("E*", ...array_fill(1,
- self::EMBEDDING_VECTOR_SIZE, 0.0));
+ C\EMBEDDING_VECTOR_SIZE, 0.0));
} else {
$term_embedding = base64_decode($row['VECTOR'], true);
}
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 55a320906..07f0466e2 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -121,7 +121,6 @@ class ProfileModel extends Model
$auto_increment = $dbm->autoIncrement($dbinfo);
$serial = $dbm->serialType($dbinfo);
$integer = $dbm->integerType($dbinfo);
- $blob = $dbm->blobType($dbinfo);
$page_type = $dbm->pageType($dbinfo);
$scraper_text = (stristr($dbinfo['DBMS'], "mysql") !== false) ?
" TEXT " : " VARCHAR(" . (10 * C\MAX_URL_LEN) . ") ";
@@ -332,11 +331,13 @@ class ProfileModel extends Model
PRIMARY KEY(ROLE_ID, ACTIVITY_ID))",
"RECOMMENDATION_ITEM_EMBEDDING" => "CREATE TABLE
RECOMMENDATION_ITEM_EMBEDDING (ID $integer NOT NULL,
- ITEM_TYPE $integer NOT NULL, VECTOR $blob, PARENT_ID $integer,
+ ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+ 32 * C\EMBEDDING_VECTOR_SIZE . "), PARENT_ID $integer,
PRIMARY KEY(ID, ITEM_TYPE))",
"RECOMMENDATION_TERM_EMBEDDING" => "CREATE TABLE
RECOMMENDATION_TERM_EMBEDDING (ID $integer NOT NULL,
- ITEM_TYPE $integer NOT NULL, VECTOR $blob,
+ ITEM_TYPE $integer NOT NULL, VECTOR VARCHAR(" .
+ 32 * C\EMBEDDING_VECTOR_SIZE . "),
PRIMARY KEY(ID, ITEM_TYPE))",
"SCRAPER" =>
"CREATE TABLE SCRAPER (ID $serial PRIMARY KEY