diff --git a/src/configs/Config.php b/src/configs/Config.php index e3602044d..b3d784ca8 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop"); * Version number for upgrade database function * @var int */ -nsdefine('DATABASE_VERSION', 73); +nsdefine('DATABASE_VERSION', 74); /** * Minimum Version fo Yioop for which keyword ad script * still works with this version @@ -1223,17 +1223,21 @@ nsdefine('RESOURCE_IMPRESSION', 6); */ nsdefine('MAX_RECOMMENDATIONS', 3); /** - * Type used to indicate ITEM_RECOMMENDATION score is about a trending thread + * Type used to indicate ITEM_TYPE is about a trending thread */ nsdefine('TRENDING_RECOMMENDATION', 1); /** - * Type used to indicate ITEM_RECOMMENDATION score is about a thread + * Type used to indicate ITEM_TYPE is about a thread */ nsdefine('THREAD_RECOMMENDATION', 2); /** - * Type used to indicate ITEM_RECOMMENDATION score is about a group + * Type used to indicate ITEM_TYPE is about a group */ nsdefine('GROUP_RECOMMENDATION', 3); +/** + * Type used to indicate ITEM_TYPE is about a wiki resource + */ +nsdefine('RESOURCE_RECOMMENDATION', 4); /** * Used to control update frequency of impression analytic data when * media updater in use diff --git a/src/controllers/components/AccountaccessComponent.php b/src/controllers/components/AccountaccessComponent.php index e7c6ebffc..d19834b7e 100644 --- a/src/controllers/components/AccountaccessComponent.php +++ b/src/controllers/components/AccountaccessComponent.php @@ -87,6 +87,8 @@ class AccountaccessComponent extends Component $data['GROUP_RECOMMENDATIONS'] = $user_model->getRecommendations($cron_timestamp, $user_id, C\GROUP_RECOMMENDATION); + $data['RESOURCE_RECOMMENDATIONS'] = + $user_model->getResourceRecommendations($user_id); $username = $signin_model->getUserName($user_id); $data["USER"] = $user_model->getUser($username); $data["CRAWL_MANAGER"] = false; diff --git 
a/src/controllers/components/SocialComponent.php b/src/controllers/components/SocialComponent.php index 073d27670..8c14a4986 100644 --- a/src/controllers/components/SocialComponent.php +++ b/src/controllers/components/SocialComponent.php @@ -62,6 +62,11 @@ class SocialComponent extends Component implements CrawlConstants * successfully uploaded */ const UPLOAD_SUCCESS = 1; + /** + * File to tell RecommendationJob the paths of eligible wiki resources + * description files + */ + const RECOMMENDATION_FILE = C\APP_DIR . "/resources/recommendation.txt"; /** * Used to handle the manage group activity. * @@ -3917,6 +3922,12 @@ EOD; needs_descriptions_format: $data["HEAD"]['update_description']); $thumb_folder = $data['RESOURCES_INFO']['thumb_folder']; + if (!empty($thumb_folder)) { + $fp = fopen(self::RECOMMENDATION_FILE, "a"); + fwrite($fp, $group_id . "###" . $data['PAGE_ID'] . "###" . + $thumb_folder . "\n"); + fclose($fp); + } $this->initUserResourcePreferences($data); $scroll_id = "scroll-container-" . L\crawlHash($data['PAGE_ID'] . $sub_path); diff --git a/src/library/LinearAlgebra.php b/src/library/LinearAlgebra.php index 31d620882..6aafddb33 100644 --- a/src/library/LinearAlgebra.php +++ b/src/library/LinearAlgebra.php @@ -56,7 +56,7 @@ class LinearAlgebra public static function add($vector1, $vector2) { if (is_array($vector1) && is_array($vector2)) { - foreach($vector2 as $coord2 => $value2) { + foreach ($vector2 as $coord2 => $value2) { $vector1[$coord2] = (empty($vector1[$coord2])) ? 
$value2 : $vector1[$coord2] + $value2; } @@ -68,7 +68,7 @@ class LinearAlgebra $scalar = $vector1; $vector1 = $vector2; } - foreach($vector1 as $coord => $value) { + foreach ($vector1 as $coord => $value) { $vector1[$coord] = $value + $scalar; } } @@ -101,7 +101,7 @@ class LinearAlgebra $sum = 0; $not_in_common = 0; $distortion = 0; - foreach($vector1 as $key => $weight) { + foreach ($vector1 as $key => $weight) { if (empty($vector2[$key])) { $sum += $weight * $weight; $not_in_common++; @@ -110,7 +110,7 @@ class LinearAlgebra $sum += $diff * $diff; } } - foreach($vector2 as $key => $weight) { + foreach ($vector2 as $key => $weight) { if (empty($vector1[$key])) { $sum += $weight * $weight; $not_in_common++; @@ -134,7 +134,7 @@ class LinearAlgebra $v1 = (count($vector1) < count($vector2)) ? $vector1 : $vector2; $v2 = (count($vector1) < count($vector2)) ? $vector2 : $vector1; $sum = 0.; - foreach($v1 as $coordinate => $value) { + foreach ($v1 as $coordinate => $value) { if (!empty($v2[$coordinate])) { $sum += $value * $v2[$coordinate]; } @@ -168,7 +168,7 @@ class LinearAlgebra public static function length($vector, $norm_power = 2) { $norm = 0.; - foreach($vector as $weight) { + foreach ($vector as $weight) { $norm += pow(abs($weight), $norm_power); } $norm = pow($norm, 1./$norm_power); @@ -185,7 +185,7 @@ class LinearAlgebra public static function multiply($scalar_vec_mat, $vector) { if (is_numeric($scalar_vec_mat)) { - foreach($vector as $coordinate => $value) { + foreach ($vector as $coordinate => $value) { $vector[$coordinate] *= $scalar_vec_mat; } return $vector; @@ -238,9 +238,12 @@ class LinearAlgebra */ public static function similarity($vector1, $vector2) { - $similarity = self::dot($vector1, $vector2) / - (self::length($vector1) * self::length($vector2)); - return $similarity; + $dot_product = self::dot($vector1, $vector2); + $length = self::length($vector1) * self::length($vector2); + if ($length == 0) { + return 0.; + } + return $dot_product / $length; } 
/** * Subtracts two vectors component-wise. Treat empty components in either @@ -256,7 +259,7 @@ class LinearAlgebra public static function subtract($vector1, $vector2) { if (is_array($vector1) && is_array($vector2)) { - foreach($vector2 as $coord2 => $value2) { + foreach ($vector2 as $coord2 => $value2) { $vector1[$coord2] = (empty($vector1[$coord2])) ? -$value2 : $vector1[$coord2] - $value2; } @@ -268,10 +271,10 @@ class LinearAlgebra $scalar = $vector1; $vector1 = $vector2; } - foreach($vector1 as $coord => $value) { + foreach ($vector1 as $coord => $value) { $vector1[$coord] = $value - $scalar; } } return $vector1; } -} +} \ No newline at end of file diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php index cf7f2c0d1..9a09191e9 100644 --- a/src/library/VersionFunctions.php +++ b/src/library/VersionFunctions.php @@ -2028,3 +2028,33 @@ function upgradeDatabaseVersion73(&$db) $db->execute("ALTER TABLE GROUP_PAGE ADD COLUMN LAST_MODIFIED NUMERIC(".C\TIMESTAMP_LEN.")"); } +/** + * Upgrades a Version 73 version of the Yioop database to a Version 74 version + * @param object $db datasource to use to upgrade + */ +function upgradeDatabaseVersion74(&$db) +{ + $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST, + "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD]; + $integer = $db->integerType($dbinfo); + $db->execute("DROP TABLE IF EXISTS ITEM_RECOMMENDATION"); + $db->execute("DROP TABLE IF EXISTS ITEM_TERM_FREQUENCY"); + $db->execute("DROP TABLE IF EXISTS ITEM_TERM_WEIGHTS"); + $db->execute("DROP TABLE IF EXISTS USER_ITEM_SIMILARITY"); + $db->execute("DROP TABLE IF EXISTS USER_TERM_FREQUENCY"); + $db->execute("DROP TABLE IF EXISTS USER_TERM_WEIGHTS"); + $db->execute("CREATE TABLE IF NOT EXISTS RECOMMENDATION_TERM_EMBEDDING " . + "(ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR BLOB, " . + "PRIMARY KEY(ID, ITEM_TYPE))"); + $db->execute("CREATE TABLE IF NOT EXISTS RECOMMENDATION_ITEM_EMBEDDING " . 
+ "(ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, VECTOR BLOB, " . + "PARENT_ID $integer, PRIMARY KEY(ID, ITEM_TYPE))"); + $db->execute("CREATE TABLE IF NOT EXISTS GROUP_ITEM_RECOMMENDATION " . + "(USER_ID $integer NOT NULL, ITEM_ID $integer NOT NULL, " . + "ITEM_TYPE $integer NOT NULL, SCORE FLOAT, TIMESTAMP " . + "NUMERIC(" . C\TIMESTAMP_LEN . "))"); + $db->execute("CREATE TABLE IF NOT EXISTS GROUP_RESOURCE_RECOMMENDATION " . + "(USER_ID $integer NOT NULL, GROUP_ID $integer NOT NULL, " . + "PAGE_ID $integer NOT NULL, RESOURCE_PATH VARCHAR(255), SCORE FLOAT, " . + "TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "), RESOURCE_ID $integer)"); +} diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php index d451c610b..f18332dac 100644 --- a/src/library/media_jobs/RecommendationJob.php +++ b/src/library/media_jobs/RecommendationJob.php @@ -34,7 +34,7 @@ namespace seekquarry\yioop\library\media_jobs; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; use seekquarry\yioop\library\LinearAlgebra as LinearAlgebra; -use seekquarry\yioop\library\CrawlConstants; +use seekquarry\yioop\library\PhraseParser as PhraseParser; use seekquarry\yioop\models\CronModel; /** @@ -77,6 +77,31 @@ class RecommendationJob extends MediaJob * Maximum number of terms used in making recommendations */ const MAX_TERMS = 20000; + /** + * File containing paths to description folders of wiki page resources + * that should be used to create data corpus for computing recommendations + */ + const RECOMMENDATION_FILE = C\APP_DIR . 
"/resources/recommendation.txt"; + /** + * Length of context window for calculating term embeddings + */ + const CONTEXT_WINDOW_LENGTH = 5; + /** + * Size of term and item embedding vector + */ + const EMBEDDING_VECTOR_SIZE = 200; + /** + * Update period to consider for fetching the records from + * ITEM_IMPRESSION_SUMMARY table + */ + const UPDATE_PERIOD = C\ONE_MONTH; + /** + * Stop words to exclude from the descriptions fetched by DescriptionUpdate + * media job + */ + const DESCRIPTION_STOP_WORDS = ["author", "authors", "plot", "genre", + "genres", "star", "stars", "credits", "rating", "ratings", "year", + "director", "cast", "runtime"]; /** * Sets up the database connection so can access tables related * to recommendations. Initialize timing info related to job. @@ -121,15 +146,19 @@ class RecommendationJob extends MediaJob "item_group_recommendations"); L\crawlLog("Current Active Recommendation Timestamp: ". $this->active_time); - L\crawlLog("...Clearing last run's intermediate results together ". - "with any old data"); - $this->clearIntermediateRecommendationData(); L\crawlLog("...Start computing similarity-based group and item ". "recommendations..."); $this->computeThreadGroupRecommendations(); L\crawlLog("...Finished computing similarity-based group and item ". "recommendations."); + L\crawlLog("...Start computing similarity-based wiki resource " . + "recommendations..."); + $this->computeWikiResourceRecommendations(); + L\crawlLog("...Finished computing similarity-based wiki" . 
+ "resource recommendations..."); + L\crawlLog("...Start computing new user recommendations..."); $this->initializeNewUserRecommendations(); + L\crawlLog("...Finished computing new user recommendations..."); $this->cron_model->updateCronTime( "item_group_recommendations", $this->update_time); } @@ -144,8 +173,8 @@ class RecommendationJob extends MediaJob $popular_recommendations = [ C\THREAD_RECOMMENDATION => [], C\GROUP_RECOMMENDATION => []]; $sql = "SELECT ITEM_ID, SUM(SCORE) AS TOTAL_SCORE FROM " . - "ITEM_RECOMMENDATION WHERE ITEM_TYPE = ? AND TIMESTAMP = " . - $this->active_time . " GROUP BY ITEM_ID ORDER BY TOTAL_SCORE DESC ". + "GROUP_ITEM_RECOMMENDATION WHERE ITEM_TYPE = ? " . + "GROUP BY ITEM_ID ORDER BY TOTAL_SCORE DESC ". $db->limitOffset(C\MAX_RECOMMENDATIONS); foreach ($popular_recommendations as $type => $recommendation) { $results = $db->execute($sql, [$type]); @@ -155,9 +184,9 @@ class RecommendationJob extends MediaJob } $new_user_sql = "SELECT USER_ID AS USER_ID ". "FROM USERS WHERE USER_ID NOT IN ". - "(SELECT USER_ID FROM ITEM_RECOMMENDATION)"; + "(SELECT USER_ID FROM GROUP_ITEM_RECOMMENDATION)"; $new_user_results = $db->execute($new_user_sql); - $base_recommend_sql = "INSERT INTO ITEM_RECOMMENDATION VALUES "; + $base_recommend_sql = "INSERT INTO GROUP_ITEM_RECOMMENDATION VALUES "; $insert_recommend_sql = $base_recommend_sql; $comma = ""; $insert_count = 0; @@ -167,13 +196,11 @@ class RecommendationJob extends MediaJob foreach ($popular_recommendations as $type => $recommendations) { foreach ($recommendations as $recommendation) { $insert_recommend_sql .= - "$comma ({$recommendation['ITEM_ID']}, $user_id, ". + "$comma ($user_id, {$recommendation['ITEM_ID']}, ". "$type, {$recommendation['TOTAL_SCORE']}," . $this->update_time . 
")"; $comma = ","; $insert_count++; - L\crawlTimeoutLog("..initialized new %s users so far", - $i++); } if ($insert_count > self::BATCH_SQL_INSERT_NUM) { $db->execute($insert_recommend_sql); @@ -186,6 +213,46 @@ class RecommendationJob extends MediaJob if ($insert_count > 0) { $db->execute($insert_recommend_sql); } + $sql = "SELECT GROUP_ID, PAGE_ID, RESOURCE_PATH, RESOURCE_ID," . + " SUM(SCORE) AS TOTAL_SCORE FROM" . + " GROUP_RESOURCE_RECOMMENDATION GROUP BY GROUP_ID," . + " PAGE_ID, RESOURCE_PATH, RESOURCE_ID ORDER BY TOTAL_SCORE DESC"; + $results = $db->execute($sql); + while ($row = $db->fetchArray($results)) { + $popular_recommendations[C\RESOURCE_RECOMMENDATION][] = $row; + } + $base_recommend_sql = "INSERT INTO GROUP_RESOURCE_RECOMMENDATION" . + " VALUES "; + $insert_recommend_sql = $base_recommend_sql; + $comma = ""; + $insert_count = 0; + $new_user_sql = "SELECT USER_ID FROM USERS WHERE USER_ID NOT IN" . + "(SELECT USER_ID FROM GROUP_RESOURCE_RECOMMENDATION)"; + $new_user_results = $db->execute($new_user_sql); + while ($row = $db->fetchArray($new_user_results)) { + $user_id = $row['USER_ID']; + $timestamp = time(); + foreach ($popular_recommendations[C\RESOURCE_RECOMMENDATION] as + $recommendation) { + $insert_recommend_sql .= + "$comma ($user_id, {$recommendation['GROUP_ID']}, ". + "{$recommendation['PAGE_ID']}, " . + "\"{$recommendation['RESOURCE_PATH']}\", ". + "{$recommendation['TOTAL_SCORE']}, {$this->update_time}, ". 
+ "{$recommendation['RESOURCE_ID']})"; + $comma = ","; + $insert_count++; + if ($insert_count > self::BATCH_SQL_INSERT_NUM) { + $db->execute($insert_recommend_sql); + $insert_recommend_sql = $base_recommend_sql; + $insert_count = 0; + $comma = ""; + } + } + } + if ($insert_count > 0) { + $db->execute($insert_recommend_sql); + } } /** * Manages the whole process of computing thread and group recommendations @@ -194,429 +261,932 @@ class RecommendationJob extends MediaJob */ public function computeThreadGroupRecommendations() { - $this->computeItemTermFrequencies(); - $this->computeUserTermFrequencies(); - $number_items = $this->numberItems(); - $number_users = $this->numberUsers(); - $this->computeUserItemIdf($number_items, $number_users); - $this->tfIdfUsers(); - $this->tfIdfItems(); - $this->computeUserItemSimilarity(); - $not_belongs_subselect = "NOT EXISTS (SELECT * FROM ". - "GROUP_ITEM B WHERE S.USER_ID=B.USER_ID ". - "AND S.THREAD_ID=B.PARENT_ID )"; - $this->calculateSimilarityRecommendations(C\THREAD_RECOMMENDATION, - "SELECT S.USER_ID, S.THREAD_ID, S.SIMILARITY FROM ". - "USER_ITEM_SIMILARITY S WHERE $not_belongs_subselect AND ". - "S.GROUP_MEMBER=1 ORDER BY S.USER_ID ASC, ". - "S.SIMILARITY DESC", C\MAX_RECOMMENDATIONS); - $this->calculateSimilarityRecommendations(C\GROUP_RECOMMENDATION, - "SELECT S.USER_ID AS USER_ID, GI.GROUP_ID AS GROUP_ID," . - "SUM(S.SIMILARITY) AS RATING FROM ". - "GROUP_ITEM GI, USER_ITEM_SIMILARITY S ". - "WHERE GI.ID = S.THREAD_ID AND S.GROUP_MEMBER=0 ". 
- "GROUP BY GI.GROUP_ID, S.USER_ID ORDER BY S.USER_ID, RATING DESC", - C\MAX_RECOMMENDATIONS); + L\crawlLog("...Start computing Item Term Embeddings..."); + [$term_embeddings, $item_terms] = $this->computeItemTermEmbeddings(); + L\crawlLog("...Finished computing Item Term Embeddings..."); + L\crawlLog("...Start computing Item Embeddings..."); + $item_embeddings = $this->computeItemEmbeddings( + $term_embeddings, $item_terms); + L\crawlLog("...Finished computing Item Embeddings..."); + L\crawlLog("...Start computing Item User Embeddings..."); + [$item_user_embeddings, $user_items] = $this-> + computeItemUserEmbeddings($item_embeddings); + L\crawlLog("...Finshed computing Item User Embeddings..."); + L\crawlLog("...Start computing Item User Recommendations..."); + $user_groups = $this->computeItemUserRecommendations($item_embeddings, + $item_user_embeddings, $user_items); + L\crawlLog("...Finished computing Item User Recommendations..."); + L\crawlLog("...Start computing Group Embeddings..."); + $group_embeddings = $this->computeGroupEmbeddings($item_embeddings); + L\crawlLog("...Finished computing Group Embeddings..."); + L\crawlLog("...Start computing Group User Embeddings..."); + [$group_user_embeddings, $user_group_impression] = + $this->computeGroupUserEmbeddings($group_embeddings); + L\crawlLog("...Finished computing Group User Embeddings..."); + L\crawlLog("...Start computing Group User Recommendations..."); + $this->computeGroupUserRecommendations($group_embeddings, + $group_user_embeddings, $user_groups, $user_group_impression); + L\crawlLog("...Finished computing Group User Recommendations..."); } /** - * Delete all rows from intermediate tables used in the calculation - * of group and thread recommendations. Also clears any non-active item - * recommendations + * Computes the term embeddings for individual items (main thread only and + * not comments) in groups feeds for the terms in their title and + * description text. 
Processes only MAX_GROUP_ITEMS which are either newly + * created or recently edited + * + * @return array [$term_embeddings, $item_terms] containing embeddings for + * terms in the items and terms in each item */ - public function clearIntermediateRecommendationData() + public function computeItemTermEmbeddings() { - $this->db->execute("DELETE FROM ITEM_RECOMMENDATION - WHERE TIMESTAMP <> '" . $this->active_time . "'"); + $db = $this->db; + $select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" . + " ITEM_TYPE = ?"; + $results = $db->execute($select_sql, [C\THREAD_RECOMMENDATION]); + $term_embeddings = []; + $item_terms = []; + while ($row = $db->fetchArray($results)) { + $term_embeddings[$row['ID']] = unserialize($row['VECTOR']); + } + $context_distance_sum = (self::CONTEXT_WINDOW_LENGTH * + (self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0; + $mean = $context_distance_sum / self::CONTEXT_WINDOW_LENGTH; + $carry = 0.0; + for ($i = 1; $i <= self::CONTEXT_WINDOW_LENGTH; $i++) { + $difference = $i - $mean; + $carry += $difference * $difference; + } + $std_deviation = sqrt($carry / self::CONTEXT_WINDOW_LENGTH); + $group_item_sql = "SELECT * FROM GROUP_ITEM WHERE ID = PARENT_ID" . + " AND TITLE NOT LIKE '%Page%' ORDER BY EDIT_DATE DESC " . + $db->limitOffset(self::MAX_GROUP_ITEMS); + $results = $db->execute($group_item_sql); + $update_term_embeddings = []; + while ($row = $db->fetchArray($results)) { + $item_id = $row['ID']; + $text_corpus = $row['TITLE'] . " " . 
$row['DESCRIPTION']; + $text_corpus = mb_strtolower($text_corpus); + $terms = $this->cleanRemoveStopWords($text_corpus); + $item_terms[$item_id] = [$terms, $row['GROUP_ID']]; + for ($i = 0; $i < count($terms); $i++) { + [$term_id, $term] = $terms[$i]; + $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE; + if (!array_key_exists($term_id, $term_embeddings)) { + $term_embeddings[$term_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + } + for ($j = $i - 1; $j >= 0 && + $j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) { + [$context_term_id, $context_term] = $terms[$j]; + $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2)); + $context_term_hash = $context_term_id % + self::EMBEDDING_VECTOR_SIZE; + $term_embeddings[$term_id][$context_term_hash] += + $weight; + $term_embeddings[$context_term_id][$term_hash] += + $weight; + } + } + } + $normalized_term_embeddings = []; + foreach ($term_embeddings as $term_id => $embedding) { + $normalized_term_embeddings[$term_id] = + LinearAlgebra::normalize($embedding); + } + $delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING" . + " WHERE ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\THREAD_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES "; + $insert_sql = $base_insert_sql; + $comma = ""; + $insert_count = 0; + $item_type = C\THREAD_RECOMMENDATION; + foreach ($normalized_term_embeddings as $term_id => $embedding) { + $serialized_embedding = serialize($embedding); + $insert_sql .= "$comma($term_id, $item_type," . 
+ " '$serialized_embedding')"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + $insert_count = 0; + $comma = ""; + $insert_sql = $base_insert_sql; + } + } + if ($insert_count > 0) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + } + return [$term_embeddings, $item_terms]; } /** - * Computes the number of group items - * @return int number of items + * Computes the item embeddings for individual items (main thread only and + * not comments) in groups feeds using the term embeddings for their terms. + * Additionally fetches the existing item embeddings from database and + * updates them if the term embeddings are updated for their terms + * + * @param array $term_embeddings embedding for the terms + * @param array $item_terms terms in each item + * @return array $updated_item_embeddings containing embeddings for items */ - public function numberItems() + public function computeItemEmbeddings($term_embeddings, $item_terms) { - $results = $this->db->execute("SELECT COUNT(*) AS NUM_ITEMS FROM ". - "GROUP_ITEM WHERE LOWER(TITLE) NOT LIKE '%page%'"); - $num_items = 0; - if ($row = $this->db->fetchArray($results)) { - $num_items = $row['NUM_ITEMS']; + $db = $this->db; + $sql = "SELECT * FROM RECOMMENDATION_ITEM_EMBEDDING" . 
+ " WHERE ITEM_TYPE = ?"; + $results = $db->execute($sql, [C\THREAD_RECOMMENDATION]); + $item_embeddings = []; + while ($row = $db->fetchArray($results)) { + $item_embeddings[$row['ID']] = [unserialize($row['VECTOR']), + $row['PARENT_ID']]; + } + $updated_item_embeddings = []; + foreach ($item_terms as $item_id => [$terms, $group_id]) { + $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0); + foreach ($terms as [$term_id, $term]) { + if (array_key_exists($term_id, $term_embeddings)) { + $item_embedding = LinearAlgebra::add($item_embedding, + $term_embeddings[$term_id]); + } + } + $updated_item_embeddings[$item_id] = [$item_embedding, $group_id]; + if (array_key_exists($item_id, $item_embeddings)) { + unset($item_embeddings[$item_id]); + } + } + foreach ($item_embeddings as $item_id => [$embedding, $parent_id]) { + $updated_item_embeddings[$item_id] = [$embedding, $parent_id]; + } + foreach ($updated_item_embeddings as $item_id => $embedding) { + $updated_item_embeddings[$item_id][0] = LinearAlgebra::normalize( + $updated_item_embeddings[$item_id][0]); } - return $num_items; + $delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING" . + " WHERE ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\THREAD_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO RECOMMENDATION_ITEM_EMBEDDING VALUES "; + $insert_sql = $base_insert_sql; + $comma = ""; + $insert_count = 0; + $item_type = C\THREAD_RECOMMENDATION; + foreach ($updated_item_embeddings as + $item_id => [$embedding, $parent_id]) { + $serialized_embedding = serialize($embedding); + $insert_sql .= "$comma($item_id, $item_type," . 
+ " '$serialized_embedding', $parent_id)"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + $insert_count = 0; + $comma = ""; + $insert_sql = $base_insert_sql; + } + } + if ($insert_count > 0) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + } + return $updated_item_embeddings; } /** - * Computes the number of users - * @return int number of users + * Computes the user embeddings based on the item embeddings which user have + * impression in ITEM_IMPRESSION_SUMMARY table for defined UPDATE_PERIOD + * + * @param array $item_embeddings embedding vectors of items + * @return array [$item_user_embedding, $user_items] user embeddings for + * items and the items id user have impression */ - public function numberUsers() + public function computeItemUserEmbeddings($item_embeddings) { - $results = - $this->db->execute("SELECT COUNT(*) AS NUM_USERS FROM USERS"); - $num_users = 0; - if ($row = $this->db->fetchArray($results)) { - $num_users = $row['NUM_USERS']; + $db = $this->db; + $db_list_function = in_array($db->to_upper_dbms, ["SQLITE3", "MYSQL"]) ? + "GROUP_CONCAT" : "STRING_AGG"; + $timestamp = floor(time() / self::UPDATE_PERIOD ) * self::UPDATE_PERIOD; + $condition = "ITEM_TYPE = ? AND USER_ID <> 2 AND" . + " ((UPDATE_PERIOD = ? AND UPDATE_TIMESTAMP = ?) OR" . + " (UPDATE_PERIOD = ?))"; + $impression_sql = "SELECT USER_ID, $db_list_function(ITEM_ID, ',') AS" . + " ITEM_IDS FROM ITEM_IMPRESSION_SUMMARY WHERE $condition" . 
+ " GROUP BY USER_ID"; + $results = $db->execute($impression_sql, + [C\THREAD_IMPRESSION, self::UPDATE_PERIOD, $timestamp, + C\MOST_RECENT_VIEW]); + $item_user_embeddings = []; + $user_items = []; + while ($row = $db->fetchArray($results)) { + $user_id = $row['USER_ID']; + $item_ids = explode(",", $row['ITEM_IDS']); + $item_ids = array_unique($item_ids); + $item_user_embeddings[$user_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + $user_items[$user_id] = []; + foreach ($item_ids as $item_id) { + if (array_key_exists($item_id, $item_embeddings)) { + $item_user_embeddings[$user_id] = LinearAlgebra::add( + $item_user_embeddings[$user_id], + $item_embeddings[$item_id][0]); + $user_items[$user_id][] = $item_id; + } + } + $item_user_embeddings[$user_id] = LinearAlgebra::normalize( + $item_user_embeddings[$user_id]); } - return $num_users; + return [$item_user_embeddings, $user_items]; } /** - * Computes the term frequencies for individual items (posts) in groups - * feeds. That is, for each item in each group for each term in that - * item compute the number of times it appears in that item. + * Computes the items recommendation for user based on the cosine similarity + * between user embeddings and item embeddings. Recommendations are + * calculated for the items user have not interacted with yet and items + * should be from the groups where the user is already a memeber + * + * @param array $item_embeddings embeddings vectors for items + * @param array $item_user_embeddings embeddings vectors for user + * @param array $user_items items id for user in impression table + * @return array $user_groups group ids where the user is a member */ - public function computeItemTermFrequencies() + public function computeItemUserRecommendations($item_embeddings, + $item_user_embeddings, $user_items) { + L\crawlLog("...Computing User Item Similarity Scores."); $db = $this->db; - $group_item_sql = "SELECT ID AS ITEM_ID, TITLE, DESCRIPTION ". - "FROM GROUP_ITEM ". 
- "WHERE LOWER(TITLE) NOT LIKE '%page%' " . - "ORDER BY PUBDATE DESC " . $db->limitOffset(self::MAX_GROUP_ITEMS); - $results = $db->execute($group_item_sql); - $base_sql = "INSERT INTO ITEM_TERM_FREQUENCY VALUES"; - $insert_sql = $base_sql; + $db_list_function = in_array($db->to_upper_dbms, ["SQLITE3", "MYSQL"]) ? + "GROUP_CONCAT" : "STRING_AGG"; + $user_group_sql = "SELECT USER_ID, $db_list_function(GROUP_ID, ',')" . + " AS GROUP_IDS FROM USER_GROUP GROUP BY USER_ID"; + $results = $db->execute($user_group_sql); + $user_groups = []; + while ($row = $db->fetchArray($results)) { + $user_id = $row['USER_ID']; + $group_ids = explode(",", $row['GROUP_IDS']); + $user_groups[$user_id] = $group_ids; + } + $item_user_recommendations = []; + foreach ($item_user_embeddings as $user_id => $embedding) { + if (array_key_exists($user_id, $user_groups)) { + $user_item = []; + if (array_key_exists($user_id, $user_items)) { + $user_item = $user_items[$user_id]; + } + $user_group = []; + if (array_key_exists($user_id, $user_groups)) { + $user_group = $user_groups[$user_id]; + } + foreach ($item_embeddings as + $item_id => [$item_embedding, $parent_id]) { + if (in_array($item_id, $user_item) || + !in_array($parent_id, $user_group)) { + continue; + } + $similarity = LinearAlgebra::similarity( + $item_embedding, $embedding); + $item_user_recommendations[] = [$user_id, + $item_id, $similarity]; + } + } + } + $delete_sql = "DELETE FROM GROUP_ITEM_RECOMMENDATION WHERE" . + " ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\THREAD_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO GROUP_ITEM_RECOMMENDATION VALUES "; + $insert_sql = $base_insert_sql; $comma = ""; $insert_count = 0; - L\crawlLog("...Computing Item Term Frequencies"); - $i = 0; - while ($item = $db->fetchArray($results)) { - $term_frequencies = $this->termCount( - $item['TITLE'] . " " . 
$item['DESCRIPTION']); - foreach ($term_frequencies as $term => $frequency) { - $log_freq = log($frequency, 10) + 1; - $insert_sql .= "$comma ({$item['ITEM_ID']}, '" . - floor(bindec(str_replace(" ", "", L\toBinString( - hash("crc32b", $term, true))))/2) . - "', $frequency, $log_freq)"; - $comma = ","; - $insert_count++; - L\crawlTimeoutLog("...%s item term frequencies so far", - $i++); - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_sql; - $insert_count = 0; - $comma = ""; - } + $item_type = C\THREAD_RECOMMENDATION; + foreach ($item_user_recommendations as $recommendation) { + $insert_sql .= "$comma({$recommendation[0]}, {$recommendation[1]}" . + ", $item_type, {$recommendation[2]}, {$this->update_time})"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + $insert_count = 0; + $comma = ""; + $insert_sql = $base_insert_sql; } } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); } + return $user_groups; } /** - * Calculates term => frequency pairs for all terms in a supplied string - * @param string $record string of terms - * @return array $term_frequencies associative array term => count - */ - public static function termCount($record) - { - $terms = explode(" ", $record); - $term_frequencies = array_count_values($terms); - return $term_frequencies; - } - /** - * Calculates the term frequencies for users. For each post of the user, - * how often the user has seen a post with that term + * Computes the group embeddings using the item embeddings for the items in + * a group. 
Additionally fetches the existing group embeddings from database + * and updates them if the item embeddings are updated + * + * @param array $item_embeddings embedding for the items + * @return array $updated_group_embeddings containing embeddings for groups */ - public function computeUserTermFrequencies() + public function computeGroupEmbeddings($item_embeddings) { $db = $this->db; - $sql = "SELECT II.USER_ID AS UID," . - "COUNT(*) AS FREQUENCY, IWF.TERM_ID AS TID ". - "FROM ITEM_TERM_FREQUENCY IWF, ITEM_IMPRESSION II ". - "WHERE IWF.ITEM_ID = II.ITEM_ID ". - "GROUP BY II.USER_ID, IWF.TERM_ID"; - $results = $db->execute($sql); - $base_insert_sql = "INSERT INTO USER_TERM_FREQUENCY VALUES "; + $sql = "SELECT * FROM RECOMMENDATION_ITEM_EMBEDDING" . + " WHERE ITEM_TYPE = ?"; + $results = $db->execute($sql, [C\GROUP_RECOMMENDATION]); + $group_embeddings = []; + while ($row = $db->fetchArray($results)) { + $group_embeddings[$row['ID']] = unserialize($row['VECTOR']); + } + $updated_group_embeddings = []; + foreach ($item_embeddings as $item_id => [$embedding, $parent_id]) { + if (array_key_exists($parent_id, $updated_group_embeddings)) { + $updated_group_embeddings[$parent_id] = LinearAlgebra::add( + $embedding, $updated_group_embeddings[$parent_id]); + } else { + $updated_group_embeddings[$parent_id] = $embedding; + } + } + foreach ($updated_group_embeddings as $group_id => $embedding) { + $embedding = LinearAlgebra::normalize($embedding); + if (array_key_exists($group_id, $group_embeddings)) { + $embedding = LinearAlgebra::add($embedding, + $group_embeddings[$group_id]); + $embedding = LinearAlgebra::normalize($embedding); + unset($group_embeddings[$group_id]); + } + $updated_group_embeddings[$group_id] = $embedding; + } + foreach ($group_embeddings as $group_id => $embedding) { + $updated_group_embeddings[$group_id] = $embedding; + } + $base_insert_sql = "INSERT INTO RECOMMENDATION_ITEM_EMBEDDING VALUES "; $insert_sql = $base_insert_sql; - $insert_count = 0; - 
L\crawlLog("...Computing User Term Frequencies"); - $i = 0; $comma = ""; - while($row = $db->fetchArray($results)) { - $uid = $row['UID']; - $wid = $row['TID']; - $log_freq = log($row['FREQUENCY'], 10) + 1.0; - $insert_sql .= "$comma ({$row['UID']}, {$row['TID']},". - "{$row['FREQUENCY']}, $log_freq)"; + $insert_count = 0; + $item_type = C\GROUP_RECOMMENDATION; + foreach ($updated_group_embeddings as $group_id => $embedding) { + $serialized_embedding = serialize($embedding); + $insert_sql .= "$comma($group_id, $item_type," . + " '$serialized_embedding', $group_id)"; $comma = ","; $insert_count++; - L\crawlTimeoutLog("...%s user term frequencies so far", - $i++); - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_insert_sql; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); $insert_count = 0; $comma = ""; + $insert_sql = $base_insert_sql; } } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); } + return $updated_group_embeddings; } /** - * Computes inverse document frequencies for each term for each user and - * for each item. That is, for a particular term, it will compute - * the number of times a user used that term in a post/the number of - * posts by that user and take the log of the result. For items, the - * idea is similar, for each thread, one calculates the number of posts - * that the term appeared in/the total number of posts in the thread and - * take the log of the result. 
+ * Computes the user embeddings based on the group embeddings which user + * have impression in ITEM_IMPRESSION_SUMMARY table for defined + * UPDATE_PERIOD or are a member in the group * - * @param int $number_items number of items - * @param int $number_users number of users + * @param array $group_embeddings embedding vectors of groups + * @return array [$group_user_embedding, $user_groups] user embeddings for + * groups and the groups id user have membership */ - public function computeUserItemIdf($number_items, $number_users) + public function computeGroupUserEmbeddings($group_embeddings) { $db = $this->db; - $terms_sql = "SELECT DISTINCT TERM_ID, SUM(FREQUENCY) AS FREQ ". - "FROM ITEM_TERM_FREQUENCY GROUP BY TERM_ID ". - "ORDER BY FREQ DESC " . $db->limitOffset(self::MAX_TERMS); - $results = $db->execute($terms_sql); - $num_items_term_sql = "SELECT COUNT(DISTINCT ITEM_ID)". - " AS NUM_ITEMS_TERM FROM ITEM_TERM_FREQUENCY ". - "WHERE TERM_ID = ? "; - $num_users_term_sql ="SELECT COUNT(DISTINCT USER_ID) ". - "AS NUM_USERS_TERM FROM USER_TERM_FREQUENCY ". - "WHERE TERM_ID = ? "; - $i = 0; - $item_idf =[]; - $user_idf = []; - L\crawlLog("...Computing User Item IDF values."); - while($row = $db->fetchArray($results)) { - $term_id = $row['TERM_ID']; - /* - Number of groups having the required term - */ - $num_items_results = $db->execute($num_items_term_sql, [$term_id]); - $row = $db->fetchArray($num_items_results); - $item_idf[$term_id] = - max(log($number_items/($row['NUM_ITEMS_TERM'] + 1), 10), 0); - /* - Number of users having the required term - */ - $num_users_results = $db->execute($num_users_term_sql, [$term_id]); - $row = $db->fetchArray($num_users_results); - $user_idf[$term_id] = - max(log($number_users/($row['NUM_USERS_TERM'] + 1), 10), 0); - L\crawlTimeoutLog("...%s user item IDFs so far", - $i++); - } - $this->item_idf = $item_idf; - $this->user_idf = $user_idf; + $db_list_function = in_array($db->to_upper_dbms, ["SQLITE3", "MYSQL"]) ? 
+ "GROUP_CONCAT" : "STRING_AGG"; + $timestamp = floor(time() / self::UPDATE_PERIOD ) * self::UPDATE_PERIOD; + $condition = "ITEM_TYPE = ? AND USER_ID <> 2 AND" . + " ((UPDATE_PERIOD = ? AND UPDATE_TIMESTAMP = ?) OR" . + " (UPDATE_PERIOD = ?))"; + $impression_sql = "SELECT USER_ID, $db_list_function(ITEM_ID, ',') AS" . + " ITEM_IDS FROM ITEM_IMPRESSION_SUMMARY WHERE $condition" . + " GROUP BY USER_ID"; + $results = $db->execute($impression_sql, + [C\GROUP_IMPRESSION, self::UPDATE_PERIOD, $timestamp, + C\MOST_RECENT_VIEW]); + $group_user_embeddings = []; + $user_groups = []; + while ($row = $db->fetchArray($results)) { + $user_id = $row['USER_ID']; + $group_ids = explode(",", $row['ITEM_IDS']); + $group_ids = array_unique($group_ids); + $group_user_embeddings[$user_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + $user_groups[$user_id] = []; + foreach ($group_ids as $group_id) { + if (array_key_exists($group_id, $group_embeddings)) { + $group_user_embeddings[$user_id] = LinearAlgebra::add( + $group_user_embeddings[$user_id], + $group_embeddings[$group_id]); + $user_groups[$user_id][] = $group_id; + } + } + $group_user_embeddings[$user_id] = LinearAlgebra::normalize( + $group_user_embeddings[$user_id]); + } + return [$group_user_embeddings, $user_groups]; } /** - * Calculates the product TF * IDF for users based on the - * results of @see computeUserItemIdf and @see computeUserTermFrequencies + * Computes the group recommendation for user based on the cosine similarity + * between user embeddings and group embeddings. 
Recommendations are + * calculated for the groups which user has not interacted with yet and + * they are not member of that group + * + * @param array $group_embeddings embeddings vector for groups + * @param array $group_user_embeddings embeddings vector for users + * @param array $user_groups groups id for user having membership + * @param array $user_group_impression group ids which user has seen */ - public function tfIdfUsers() + public function computeGroupUserRecommendations($group_embeddings, + $group_user_embeddings, $user_groups, $user_group_impression) { - L\crawlLog("...Computing TF*IDF scores for users."); $db = $this->db; - $user_idf = $this->user_idf; - $user_terms_sql = "SELECT TERM_ID, USER_ID, LOG_FREQUENCY ". - "FROM USER_TERM_FREQUENCY"; - $base_insert_sql = "INSERT INTO USER_TERM_WEIGHTS VALUES "; + $invite_groups_sql = "SELECT GROUP_ID FROM SOCIAL_GROUPS" . + " WHERE REGISTER_TYPE = ?"; + $results = $db->execute($invite_groups_sql, [C\INVITE_ONLY_JOIN]); + $exclude_group_ids = []; + while ($row = $db->fetchArray($results)) { + $exclude_group_ids[] = $row['GROUP_ID']; + } + $group_user_recommendations = []; + foreach ($group_user_embeddings as $user_id => $embedding) { + $user_group = $user_groups[$user_id] ?? []; + $impression_group = $user_group_impression[$user_id] ?? []; + foreach ($group_embeddings as $group_id => $group_embedding) { + if (in_array($group_id, $exclude_group_ids) || + in_array($group_id, $user_group) || + in_array($group_id, $impression_group)) { + continue; + } + $similarity = LinearAlgebra::similarity($embedding, + $group_embedding); + $group_user_recommendations[] = [$user_id, $group_id, + $similarity]; + } + } + $delete_sql = "DELETE FROM GROUP_ITEM_RECOMMENDATION WHERE" . 
+ " ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\GROUP_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO GROUP_ITEM_RECOMMENDATION VALUES "; $insert_sql = $base_insert_sql; - $results = $db->execute($user_terms_sql); - $insert_count = 0; - $i = 0; $comma = ""; - while($row = $db->fetchArray($results)) { - L\crawlTimeoutLog("...%s user tf-idfs so far", - $i++); - if (!empty($user_idf[$row['TERM_ID']])) { - $insert_sql .= "$comma ({$row['TERM_ID']}, {$row['USER_ID']}, ". - ($row["LOG_FREQUENCY"] * $user_idf[$row['TERM_ID']]) . ")"; - $insert_count++; - $comma = ","; - } - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_insert_sql; + $insert_count = 0; + $item_type = C\GROUP_RECOMMENDATION; + foreach ($group_user_recommendations as $recommendation) { + $insert_sql .= "$comma({$recommendation[0]}, {$recommendation[1]}" . + ", $item_type, {$recommendation[2]}, {$this->update_time})"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); $insert_count = 0; $comma = ""; + $insert_sql = $base_insert_sql; } } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + } + } + /** + * Manages the whole process of computing wiki resource recommendations + * for users. Makes a series of calls to handle parts of this computation + * before synthesizing the result + */ + public function computeWikiResourceRecommendations() + { + L\crawlLog("...Start fetching descriptions for the wiki page " . + "resources..."); + [$descriptions, $resource_metadata] = $this-> + getWikiResourceDescriptions(); + L\crawlLog("...Finished fetching descriptions for the wiki page " . 
+ "resources..."); + $item_embeddings = $this->getWikiResourceEmbeddings(); + L\crawlLog("...Start computing wiki term embeddings..."); + [$term_embeddings, $resource_terms, $meta_details_terms] = + $this->computeWikiTermEmbeddings($descriptions, $item_embeddings); + L\crawlLog("...Finished computing wiki term embeddings..."); + L\crawlLog("...Start computing wiki resource embeddings..."); + $item_embeddings = $this->computeWikiResourceEmbeddings($resource_terms, + $meta_details_terms, $term_embeddings, $item_embeddings); + L\crawlLog("...Finished computing wiki resource embeddings..."); + L\crawlLog("...Start computing wiki user embeddings..."); + [$user_embeddings, $user_items] = $this->computeWikiUserEmbeddings( + $item_embeddings); + L\crawlLog("...Finished computing wiki user embeddings..."); + L\crawlLog("...Start computing wiki resource recommendations..."); + $this->computeWikiUserRecommendations($item_embeddings, + $user_embeddings, $user_items, $resource_metadata); + L\crawlLog("...Done computing wiki resource recommendations..."); + } + /** + * Fetches the description for the eligible wiki resources having the root + * folder path captured in RECOMMENDATION_FILE + * + * @return array [$descriptions, $resource_metadata] keyed by resource id + */ + public function getWikiResourceDescriptions() + { + $thumb_folders = explode("\n", + file_get_contents(self::RECOMMENDATION_FILE)); + $thumb_folders = array_filter(array_unique($thumb_folders)); + $descriptions = []; + $resource_metadata = []; + foreach ($thumb_folders as $thumb_folder) { + list($group_id, $page_id, $folder) = explode("###", $thumb_folder); + $folder = trim($folder, " \n\r\t\v\x00"); + $files = $this->getDescriptionFiles($folder); + foreach ($files as $file) { + $resource_file = substr($file, 0, strlen($file) - 4); + $resource_id = unpack('n', md5($group_id . $page_id . 
+ $resource_file, true))[1]; + if (array_key_exists($resource_id, $descriptions)) { + continue; + } + $description = file_get_contents($file); + if (strcmp($description, "Description search sources". + " failed to find description.") == 0) { + continue; + } + $descriptions[$resource_id] = $description; + $resource_metadata[$resource_id] = [$group_id, + $page_id, $resource_file]; + } + } + return [$descriptions, $resource_metadata]; + } + /** + * Returns all the resource description files in a given thumb folder and + * also recursively scan through subfolders if any + * + * @param string $thumb_folder path of a thumb folder + * @return array $files list of description files path in given folder + */ + public function getDescriptionFiles($thumb_folder) + { + if (!is_dir($thumb_folder)) { + return []; + } + $exclude_files = [".", "..", "needs_description.txt", + "subfolder_counts.txt", ".DS_Store"]; + $files = scandir($thumb_folder); + $file_paths = []; + foreach ($files as $file) { + if (in_array($file, $exclude_files)) { + continue; + } + if (is_dir($thumb_folder . "/" . $file)) { + L\crawlLog("...This is a folder, looking files inside it..."); + $sub_file_paths = $this->getDescriptionFiles( + $thumb_folder . "/" . $file); + $file_paths = array_merge($file_paths, $sub_file_paths); + } else { + $file_paths[] = $thumb_folder . "/$file"; + } + } + return $file_paths; + } + /** + * Retrieves wiki resources embeddings from the database + * + * @return array $item_embeddings embedding vector for resources + */ + public function getWikiResourceEmbeddings() + { + $db = $this->db; + $sql = "SELECT * FROM RECOMMENDATION_ITEM_EMBEDDING WHERE" . 
+ " ITEM_TYPE = ?"; + $results = $db->execute($sql, [C\RESOURCE_RECOMMENDATION]); + $item_embeddings = []; + while ($row = $db->fetchArray($results)) { + $item_embeddings[$row['ID']] = unserialize($row['VECTOR']); } + return $item_embeddings; } /** - * Calculates the product TF * IDF for users based on the - * results of @see computeUserItemIdf and @see computeItemTermFrequencies + * Computes the embedding for new terms in the description of wiki + * resources and updates the embedding of existing terms using Hash2Vec + * approach + * + * @param array $descriptions of resources + * @param array $item_embeddings embedding vector for resources + * @return array [$term_embeddings, $resource_terms, $meta_details_term] + * first with key being term id and value is the embedding vector for that + * term, second with key being resource id and value being array of clean + * terms in that resource description */ - public function tfIdfItems() + public function computeWikiTermEmbeddings($descriptions, $item_embeddings) { - L\crawlLog("...Computing TF*IDF scores for items."); $db = $this->db; - $item_idf = $this->item_idf; - $item_terms_sql = "SELECT TERM_ID, ITEM_ID, LOG_FREQUENCY ". - "FROM ITEM_TERM_FREQUENCY"; - $base_insert_sql = "INSERT INTO ITEM_TERM_WEIGHTS VALUES "; + $select_sql = "SELECT * FROM RECOMMENDATION_TERM_EMBEDDING WHERE" . 
+ " ITEM_TYPE = ?"; + $results = $db->execute($select_sql, [C\RESOURCE_RECOMMENDATION]); + $term_embeddings = []; + $resource_terms = []; + $meta_details_terms = []; + while ($row = $db->fetchArray($results)) { + $term_embeddings[$row['ID']] = unserialize($row['VECTOR']); + } + $context_distance_sum = (self::CONTEXT_WINDOW_LENGTH * + (self::CONTEXT_WINDOW_LENGTH + 1)) / 2.0; + $mean = $context_distance_sum / self::CONTEXT_WINDOW_LENGTH; + $carry = 0.0; + for ($i = 1; $i <= self::CONTEXT_WINDOW_LENGTH; $i++) { + $difference = $i - $mean; + $carry += $difference * $difference; + } + $std_deviation = sqrt($carry / self::CONTEXT_WINDOW_LENGTH); + foreach ($descriptions as $resource_id => $description) { + $resource_terms[$resource_id] = []; + $meta_details_terms[$resource_id] = []; + $description_parts = explode("\n", $description); + foreach ($description_parts as $description_part) { + $description_part = mb_strtolower($description_part); + $terms = $this->cleanRemoveStopWords($description_part, true); + if (count($terms) < self::CONTEXT_WINDOW_LENGTH) { + $meta_details_terms[$resource_id] = array_merge($terms, + $meta_details_terms[$resource_id]); + } else { + $resource_terms[$resource_id] = array_merge($terms, + $resource_terms[$resource_id]); + } + } + if (array_key_exists($resource_id, $item_embeddings)) { + continue; + } + if (count($resource_terms[$resource_id]) > 0) { + $terms = $resource_terms[$resource_id]; + for ($i = 0; $i < count($terms); $i++) { + [$term_id, $term] = $terms[$i]; + $term_hash = $term_id % self::EMBEDDING_VECTOR_SIZE; + if (!array_key_exists($term_id, $term_embeddings)) { + $term_embeddings[$term_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + } + for ($j = $i - 1; $j >= 0 && + $j >= $i - self::CONTEXT_WINDOW_LENGTH; $j--) { + [$context_term_id, $context_term] = $terms[$j]; + $weight = exp(-1 * pow(($i - $j) / $std_deviation, 2)); + $context_term_hash = $context_term_id % + self::EMBEDDING_VECTOR_SIZE; + 
$term_embeddings[$term_id][$context_term_hash] += + $weight; + $term_embeddings[$context_term_id][$term_hash] += + $weight; + } + } + } + } + $delete_sql = "DELETE FROM RECOMMENDATION_TERM_EMBEDDING WHERE" . + " ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\RESOURCE_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO RECOMMENDATION_TERM_EMBEDDING VALUES "; $insert_sql = $base_insert_sql; - $results = $db->execute($item_terms_sql); - $insert_count = 0; - $i = 0; $comma = ""; - while($row = $db->fetchArray($results)) { - L\crawlTimeoutLog("...%s term tf-idfs so far", - $i++); - if (!empty($item_idf[$row['TERM_ID']])) { - $insert_sql .= "$comma ({$row['TERM_ID']}, {$row['ITEM_ID']}, ". - ($row["LOG_FREQUENCY"] * $item_idf[$row['TERM_ID']]) . ")"; - $insert_count++; - $comma = ","; - } - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_insert_sql; + $insert_count = 0; + $item_type = C\RESOURCE_RECOMMENDATION; + foreach ($term_embeddings as $term_id => $embedding) { + $serialized_embedding = serialize($embedding); + $insert_sql .= "$comma($term_id, $item_type, " . 
+ "'$serialized_embedding')"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); $insert_count = 0; $comma = ""; + $insert_sql = $base_insert_sql; } } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + } + return [$term_embeddings, $resource_terms, $meta_details_terms]; + } + /** + * Split the given text into terms, clean the terms by removing non + * alphanumeric characters and remove the stop terms in order to reduce the + * noise while calculating the embeddings + * + * @param string $text which needs to be processed + * @param boolean $description_stop_word_flag to remove + * words present in DESCRIPTION_STOP_WORDS + * @return array $term_ids list of [term_id, term] pairs, term_id calculated + * using md5 hash for the term + */ + public function cleanRemoveStopWords($text, + $description_stop_word_flag = false) + { + $raw_terms = preg_split("/[\s,\/\._-]+/", $text); + $terms = []; + foreach ($raw_terms as $term) { + $term = preg_replace("/&rsquo/", "'", $term); + $term = preg_replace("/\W/", "", $term); + $term = str_replace(['"', "'"], "", $term); + if (strlen($term) > 0) { + $terms[] = $term; + } + } + $text_locale = L\guessLocaleFromString($text); + $stop_obj = PhraseParser::getTokenizer($text_locale); + if ($stop_obj && method_exists($stop_obj, "stoptermsRemover")) { + $terms = $stop_obj->stoptermsRemover($terms); + } + $term_ids = []; + foreach ($terms as $term) { + if ($description_stop_word_flag && + in_array($term, self::DESCRIPTION_STOP_WORDS)) { + continue; + } + $term_id = unpack('n', md5($term, true))[1]; + $term_ids[] = [$term_id, $term]; } + return $term_ids; } /** - * Computes the cosine similarity between users and particular threads - * based on TF*IDF scores and inserts the result into USER_ITEM_SIMILARITY + 
* Computes the embeddings for wiki page resources using the calculated + * term embeddings and add the metadata details separately to the embeddings + * + * @param array $resource_terms of processed terms from resource description + * @param array $meta_details_terms of raw resource descriptions + * @param array $term_embeddings of term embeddings + * @param array $item_embeddings of existing wiki resource embeddings + * @return array $updated_item_embeddings array of updated wiki resource + * embeddings */ - public function computeUserItemSimilarity() + public function computeWikiResourceEmbeddings($resource_terms, + $meta_details_terms, $term_embeddings, $item_embeddings) { - L\crawlLog("...Computing User Item Similarity Scores."); $db = $this->db; - $similarity_parts_sql = - "SELECT SUM(UTW.WEIGHT * ITW.WEIGHT) AS THREAD_DOT_USER, ". - "SUM(UTW.WEIGHT * UTW.WEIGHT) AS USER_MAG," . - "SUM(ITW.WEIGHT * ITW.WEIGHT) AS ITEM_MAG," . - "GI.PARENT_ID AS THREAD_ID, UTW.USER_ID AS USER_ID ". - "FROM ITEM_TERM_WEIGHTS ITW, USER_TERM_WEIGHTS UTW, GROUP_ITEM GI ". - "WHERE GI.ID = ITW.ITEM_ID AND UTW.TERM_ID=ITW.TERM_ID " . - "GROUP BY UTW.USER_ID, GI.PARENT_ID"; - $similarity_parts_result = $db->execute($similarity_parts_sql); - //used to check if belong to group - $member_info_sql = "SELECT GI.GROUP_ID FROM ". - "USER_GROUP UG, GROUP_ITEM GI WHERE ". - "UG.GROUP_ID = GI.GROUP_ID AND LOWER(GI.TITLE) ". - "NOT LIKE '%page%' AND UG.USER_ID = ? AND GI.ID = ?"; - //used to check if can join group easily - $register_info_sql = "SELECT G.GROUP_ID, G.REGISTER_TYPE AS REGISTER ". - "FROM SOCIAL_GROUPS G, GROUP_ITEM GI WHERE ". - "G.GROUP_ID = GI.GROUP_ID AND GI.ID = ? 
"; - $insert_count = 0; - $base_sql = "INSERT INTO USER_ITEM_SIMILARITY VALUES "; - $insert_sql = $base_sql; - $comma = ""; - $i = 0; - while($row = $db->fetchArray($similarity_parts_result)) { - list($item_dot_user, $user_magnitude, - $item_magnitude, $thread_id, $user_id,) = array_values($row); - $user_magnitude = sqrt($user_magnitude); - $item_magnitude = sqrt($item_magnitude); - $add_record = false; - if ($result = $db->execute($member_info_sql, [$user_id, - $thread_id])){ - $info_row = $db->fetchArray($result); - if (!empty($info_row) && $item_dot_user > 0) { - $add_record = true; - $group_member = 1; - } else { - $access_results = - $db->execute($register_info_sql, [$thread_id]); - if ($access_results && - $access_row = $db->fetchArray($access_results)) { - if (in_array($access_row['REGISTER'], - [C\PUBLIC_BROWSE_REQUEST_JOIN, C\PUBLIC_JOIN])) { - $add_record = true; - $group_member = 0; - } - } + $updated_item_embeddings = []; + foreach ($resource_terms as $resource_id => $terms) { + $item_embedding = array_fill(0, self::EMBEDDING_VECTOR_SIZE, 0); + foreach ($terms as [$term_id, $term]) { + if (array_key_exists($term_id, $term_embeddings)) { + $item_embedding = LinearAlgebra::add($item_embedding, + $term_embeddings[$term_id]); } } - L\crawlTimeoutLog("...%s similarity scores so far", $i++); - if ($add_record) { - $cos_sim = floatval($item_dot_user) - /floatval($user_magnitude * $item_magnitude); - $insert_count++; - $insert_sql .= "$comma ($user_id, $thread_id, $cos_sim, - $group_member)"; - $comma = ","; - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_sql; - $insert_count = 0; - $comma = ""; + $updated_item_embeddings[$resource_id] = $item_embedding; + if (array_key_exists($resource_id, $item_embeddings)) { + unset($item_embeddings[$resource_id]); + } + } + foreach ($item_embeddings as $resource_id => $embedding) { + 
$updated_item_embeddings[$resource_id] = $embedding; + } + foreach ($meta_details_terms as $resource_id => $meta_terms) { + if (!array_key_exists($resource_id, $updated_item_embeddings)) { + $updated_item_embeddings[$resource_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + } + foreach ($meta_terms as [$meta_term_id, $meta_term]) { + if (strlen($meta_term) <= 1) { + continue; } + $updated_item_embeddings[$resource_id][$meta_term_id] += 1.; + } + } + foreach ($updated_item_embeddings as $item_id => $embedding) { + $updated_item_embeddings[$item_id] = LinearAlgebra::normalize( + $embedding); + } + $delete_sql = "DELETE FROM RECOMMENDATION_ITEM_EMBEDDING WHERE" . + " ITEM_TYPE = ?"; + $db->execute($delete_sql, [C\RESOURCE_RECOMMENDATION]); + $base_insert_sql = "INSERT INTO RECOMMENDATION_ITEM_EMBEDDING VALUES "; + $insert_sql = $base_insert_sql; + $comma = ""; + $insert_count = 0; + $item_type = C\RESOURCE_RECOMMENDATION; + foreach ($updated_item_embeddings as $resource_id => $embedding) { + $serialized_embedding = serialize($embedding); + $insert_sql .= "$comma($resource_id, $item_type," . 
+ " '$serialized_embedding', $resource_id)"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + $insert_count = 0; + $comma = ""; + $insert_sql = $base_insert_sql; } } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + } + return $updated_item_embeddings; + } + /** + * Computes user embeddings for wiki resources based on the user's resources + * impression logged in ITEM_IMPRESSION_SUMMARY table for the defined update + * period + * + * @param array $item_embeddings of wiki page resources embedding + * @return array [$user_embeddings, $user_items] of user embeddings + * for wiki resources and the user resource impression + */ + public function computeWikiUserEmbeddings($item_embeddings) + { + $db = $this->db; + $db_list_function = in_array($db->to_upper_dbms, ["SQLITE3", "MYSQL"]) ? + "GROUP_CONCAT" : "STRING_AGG"; + $timestamp = floor(time() / self::UPDATE_PERIOD ) * self::UPDATE_PERIOD; + $condition = "ITEM_TYPE = ? AND USER_ID <> 2 AND" . + " ((UPDATE_PERIOD = ? AND UPDATE_TIMESTAMP = ?) OR" . + " (UPDATE_PERIOD = ?))"; + $impression_sql = "SELECT USER_ID, $db_list_function(ITEM_ID, ',') AS" . + " ITEM_IDS FROM ITEM_IMPRESSION_SUMMARY WHERE $condition" . 
+ " GROUP BY USER_ID"; + $results = $db->execute($impression_sql, + [C\RESOURCE_IMPRESSION, self::UPDATE_PERIOD, $timestamp, + C\MOST_RECENT_VIEW]); + $user_embeddings = []; + $user_items = []; + while ($row = $db->fetchArray($results)) { + $user_id = $row['USER_ID']; + $item_ids = explode(",", $row['ITEM_IDS']); + $item_ids = array_unique($item_ids); + $user_embeddings[$user_id] = array_fill(0, + self::EMBEDDING_VECTOR_SIZE, 0); + $user_items[$user_id] = []; + foreach ($item_ids as $item_id) { + if (array_key_exists($item_id, $item_embeddings)) { + $user_embeddings[$user_id] = LinearAlgebra::add( + $user_embeddings[$user_id], $item_embeddings[$item_id]); + $user_items[$user_id][] = $item_id; + } + } + $user_embeddings[$user_id] = LinearAlgebra::normalize( + $user_embeddings[$user_id]); } + return [$user_embeddings, $user_items]; } /** - * Computes up to $max_recommendations item recommendations of the given - * type (thread or group) based on query which computes similarity score - * between a user and a given type. 
- * @param int $recommendation_type a config.php constant indicating the type - * of recommendation to compute - * @param $similarity_sql query used to determine user similarity scores - * should output triples: (user_id item_id rating) - * @param int $max_recommendations maximum number of recommendations to - * compute per user - */ - public function calculateSimilarityRecommendations($recommendation_type, - $similarity_sql, $max_recommendations) + * Computes the wiki resource recommendations based on cosine similarity + * between resource embeddings and user embeddings + * + * @param array $item_embeddings of wiki resources embeddings + * @param array $user_embeddings of users consumed wiki resources + * embeddings + * @param array $user_items of users consumed wiki resources + */ + public function computeWikiUserRecommendations($item_embeddings, + $user_embeddings, $user_items, $resource_metadata) { $db = $this->db; - $base_sql = "INSERT INTO ITEM_RECOMMENDATION VALUES"; - $insert_sql = $base_sql; - $similarity_results = $db->execute($similarity_sql); - if (!$similarity_results) { - return; + $recommendations = []; + foreach ($user_embeddings as $user_id => $user_embedding) { + if (array_key_exists($user_id, $user_items)) { + $user_item = $user_items[$user_id]; + } else { + $user_item = []; + } + foreach ($item_embeddings as $item_id => $item_embedding) { + if (in_array($item_id, $user_item) || + !array_key_exists($item_id, $resource_metadata)) { + continue; + } + $similarity = LinearAlgebra::similarity($user_embedding, + $item_embedding); + list($group_id, $page_id, $resource_path) = + $resource_metadata[$item_id]; + $recommendations[] = [$user_id, $group_id, $page_id, + $resource_path, $similarity, $item_id]; + } } - $old_user_id = -1; // assume no one has this id + $delete_sql = "DELETE FROM GROUP_RESOURCE_RECOMMENDATION"; + $db->execute($delete_sql); + $base_insert_sql = "INSERT INTO GROUP_RESOURCE_RECOMMENDATION " . 
+ "VALUES "; + $insert_sql = $base_insert_sql; $comma = ""; $insert_count = 0; - $i = 0; - L\crawlLog("...Computing type: $recommendation_type ". - "recommendations"); - while($row = $db->fetchArray($similarity_results)) { - list($user_id, $item_id, $similarity, ) = array_values($row); - if ($user_id != $old_user_id) { - $old_user_id = $user_id; - $num_recommended = 1; - } - if ($num_recommended <= $max_recommendations - && $old_user_id == $user_id) { - $insert_sql .= "$comma ($item_id, $user_id, " . - $recommendation_type . - ", $similarity, {$this->update_time})"; - $comma = ","; - $insert_count++; - if ($insert_count > self::BATCH_SQL_INSERT_NUM) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); - $insert_sql = $base_sql; - $insert_count = 0; - $comma = ""; - } - $num_recommended++; - $old_user_id = $user_id; + foreach ($recommendations as $recommendation) { + list($user_id, $group_id, $page_id, $resource_path, + $score, $item_id) = $recommendation; + $resource_path = str_replace("'", "''", $resource_path); + $insert_sql .= "$comma($user_id, $group_id, $page_id, " . + "'$resource_path', $score, {$this->update_time}, $item_id)"; + $comma = ","; + $insert_count++; + if ($insert_count == self::BATCH_SQL_INSERT_NUM) { + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); + $insert_count = 0; + $comma = ""; + $insert_sql = $base_insert_sql; } - L\crawlTimeoutLog("...%s recommendations so far", $i++); } if ($insert_count > 0) { - $insert_ignore_sql = $db->insertIgnore($insert_sql); - $db->execute($insert_ignore_sql); + $insert_sql = $db->insertIgnore($insert_sql); + $db->execute($insert_sql); } } } diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 1da5ff1cd..4392a44bd 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -1073,6 +1073,7 @@ manageaccount_element_more_groups = "More groups..." 
manageaccount_element_recommendations = "Recommendations" manageaccount_element_rec_threads = "Threads:" manageaccount_element_rec_groups = "Groups:" +manageaccount_element_rec_resources = "Page Resources:" manageaccount_element_account_details = "Account Details" manageaccount_element_username = "Username" manageaccount_element_firstname = "First Name" diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php index e301f3be3..a82579b93 100755 --- a/src/models/ProfileModel.php +++ b/src/models/ProfileModel.php @@ -210,6 +210,10 @@ class ProfileModel extends Model GI_PARENT_ID_PUBDATE_INDEX ON GROUP_ITEM (PARENT_ID, PUBDATE)", "GROUP_ITEM_VOTE" => "CREATE TABLE GROUP_ITEM_VOTE( USER_ID $integer, ITEM_ID $integer)", + "GROUP_ITEM_RECOMMENDATION" => "CREATE TABLE + GROUP_ITEM_RECOMMENDATION (USER_ID $integer NOT NULL, + ITEM_ID $integer NOT NULL, ITEM_TYPE $integer NOT NULL, + SCORE FLOAT, TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "))", "GROUP_PAGE" => "CREATE TABLE GROUP_PAGE ( ID $serial PRIMARY KEY $auto_increment, GROUP_ID $integer, DISCUSS_THREAD $integer, TITLE VARCHAR(" . C\TITLE_LEN . "), @@ -237,6 +241,11 @@ class ProfileModel extends Model TO_PAGE_NAME))", "GPP_ID_INDEX" => "CREATE INDEX GP_PRE_INDEX ON GROUP_PAGE_PRE_LINK (TO_GROUP_ID, TO_PAGE_NAME)", + "GROUP_RESOURCE_RECOMMENDATION" => "CREATE TABLE + GROUP_RESOURCE_RECOMMENDATION (USER_ID $integer NOT NULL, + GROUP_ID $integer NOT NULL, PAGE_ID $integer NOT NULL, + RESOURCE_PATH VARCHAR(255), SCORE FLOAT, TIMESTAMP NUMERIC(" . + C\TIMESTAMP_LEN . "), RESOURCE_ID $integer)", "SOCIAL_GROUPS" => "CREATE TABLE SOCIAL_GROUPS ( GROUP_ID $serial PRIMARY KEY $auto_increment, GROUP_NAME VARCHAR(" . 
C\SHORT_TITLE_LEN @@ -270,19 +279,6 @@ class ProfileModel extends Model TMP_NUM_VIEWS $integer DEFAULT -1, PRIMARY KEY(USER_ID, ITEM_ID, ITEM_TYPE, UPDATE_PERIOD, UPDATE_TIMESTAMP))", - "ITEM_RECOMMENDATION" => "CREATE TABLE ITEM_RECOMMENDATION ( - ITEM_ID $integer, USER_ID $integer, ITEM_TYPE $integer, - SCORE FLOAT, TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "))", - "IR_USER_ID_INDEX" => "CREATE INDEX IR_USER_ID_INDEX ON - ITEM_RECOMMENDATION(USER_ID)", - "ITEM_TERM_FREQUENCY"=> "CREATE TABLE ITEM_TERM_FREQUENCY - (ITEM_ID $integer, TERM_ID $integer, FREQUENCY $integer, - LOG_FREQUENCY FLOAT, PRIMARY KEY(ITEM_ID, TERM_ID))", - "ITF_TERM_ID_INDEX" => "CREATE INDEX ITF_TERM_ID_INDEX ON - ITEM_TERM_FREQUENCY(TERM_ID)", - "ITEM_TERM_WEIGHTS"=> "CREATE TABLE ITEM_TERM_WEIGHTS ( - TERM_ID $integer, ITEM_ID $integer, WEIGHT FLOAT, - PRIMARY KEY(TERM_ID, ITEM_ID))", "LOCALE" => "CREATE TABLE LOCALE(LOCALE_ID $serial PRIMARY KEY $auto_increment, LOCALE_TAG VARCHAR(" . C\NAME_LEN . "), LOCALE_NAME VARCHAR(" . C\LONG_NAME_LEN . @@ -329,6 +325,14 @@ class ProfileModel extends Model ACTIVITY_ID $integer, ALLOWED_ARGUMENTS VARCHAR(" . C\MAX_URL_LEN . ") DEFAULT 'all', PRIMARY KEY(ROLE_ID, ACTIVITY_ID))", + "RECOMMENDATION_ITEM_EMBEDDING" => "CREATE TABLE + RECOMMENDATION_ITEM_EMBEDDING (ID $integer NOT NULL, + ITEM_TYPE $integer NOT NULL, VECTOR BLOB, PARENT_ID $integer, + PRIMARY KEY(ID, ITEM_TYPE))", + "RECOMMENDATION_TERM_EMBEDDING" => "CREATE TABLE + RECOMMENDATION_TERM_EMBEDDING (ID $integer NOT NULL, + ITEM_TYPE $integer NOT NULL, VECTOR BLOB, + PRIMARY KEY(ID, ITEM_TYPE))", "SCRAPER" => "CREATE TABLE SCRAPER (ID $serial PRIMARY KEY $auto_increment, NAME VARCHAR(" . C\TITLE_LEN . 
"),
@@ -382,18 +386,6 @@ class ProfileModel extends Model
                 ROLE_ID $integer, PRIMARY KEY (ROLE_ID, USER_ID))",
             "USER_SESSION" => "CREATE TABLE USER_SESSION (
                 USER_ID $integer PRIMARY KEY, SESSION $user_session_text)",
-            "USER_ITEM_SIMILARITY" => "CREATE TABLE USER_ITEM_SIMILARITY
-                (USER_ID $integer, THREAD_ID $integer, SIMILARITY FLOAT,
-                GROUP_MEMBER $integer,
-                PRIMARY KEY(USER_ID, THREAD_ID))",
-            "USER_TERM_FREQUENCY"=>"CREATE TABLE USER_TERM_FREQUENCY
-                (USER_ID $integer, TERM_ID $integer, FREQUENCY $integer,
-                LOG_FREQUENCY FLOAT, PRIMARY KEY(USER_ID, TERM_ID))",
-            "UTF_TERM_ID_INDEX" => "CREATE INDEX UTF_TERM_ID_INDEX ON
-                USER_TERM_FREQUENCY(TERM_ID)",
-            "USER_TERM_WEIGHTS"=>"CREATE TABLE USER_TERM_WEIGHTS
-                (TERM_ID $integer, USER_ID $integer, WEIGHT FLOAT,
-                PRIMARY KEY(TERM_ID, USER_ID))",
             "VISITOR" => "CREATE TABLE VISITOR(ADDRESS VARCHAR(".
                 C\MAX_IP_ADDRESS_AS_STRING_LEN . "), PAGE_NAME VARCHAR(" .
                 C\NAME_LEN . "),
diff --git a/src/models/UserModel.php b/src/models/UserModel.php
index 3574128aa..934b4a0b6 100755
--- a/src/models/UserModel.php
+++ b/src/models/UserModel.php
@@ -618,7 +618,7 @@ class UserModel extends Model
             $name_id = "GROUP_ID";
         }
         $sql = "SELECT IR.ITEM_ID AS ID, NT.$name_column AS NAME ".
-            "FROM ITEM_RECOMMENDATION IR, $name_table NT ".
+            "FROM GROUP_ITEM_RECOMMENDATION IR, $name_table NT ".
             "WHERE IR.ITEM_ID = NT.$name_id AND IR.USER_ID = ? AND " .
             "ITEM_TYPE = ? AND TIMESTAMP = $timestamp " .
             "ORDER BY SCORE DESC " . $db->limitOffset($num);
@@ -629,4 +629,63 @@ class UserModel extends Model
         }
         return $recommendations;
     }
+    /**
+     * Returns the top scoring wiki page resource recommendations that
+     * have been stored for a user in GROUP_RESOURCE_RECOMMENDATION
+     *
+     * @param int $user_id id of the user to get recommendations for
+     * @param int $num maximum number of recommendation rows to read
+     * @return array list of arrays [group_id, page_id, page_title,
+     *      resource_name, sub_path] in descending score order; rows
+     *      whose wiki page can no longer be found are left out
+     */
+    public function getResourceRecommendations($user_id, $num = 3)
+    {
+        /* Number of characters from the start of "/resources/" to the
+           start of a resource's sub-path within RESOURCE_PATH. Kept at
+           the original value; presumably "/resources/" + 3 char prefix
+           folder + "/" + 12 char page folder + "/" -- TODO confirm
+         */
+        $prefix_len = 28;
+        $db = $this->db;
+        $recommendations = [];
+        $sql = "SELECT * FROM GROUP_RESOURCE_RECOMMENDATION WHERE" .
+            " USER_ID = ? ORDER BY SCORE DESC " . $db->limitOffset($num);
+        $results = $db->execute($sql, [$user_id]);
+        if (!$results) {
+            return $recommendations;
+        }
+        $page_sql = "SELECT TITLE FROM GROUP_PAGE WHERE ID = ?";
+        while ($row = $db->fetchArray($results)) {
+            /* look the title up fresh for each row so a missing page
+               can never pick up the title of the previous row */
+            $page_title = null;
+            $result = $db->execute($page_sql, [$row['PAGE_ID']]);
+            if ($result && ($sub_row = $db->fetchArray($result))) {
+                $page_title = $sub_row['TITLE'];
+            }
+            if ($page_title === null) {
+                continue;
+            }
+            $path = (string)$row['RESOURCE_PATH'];
+            $index = strrpos($path, "/");
+            $name = ($index === false) ? $path :
+                substr($path, $index + 1);
+            $resource_index = strrpos($path, "/resources/");
+            $sub_path = "";
+            if ($index !== false && $resource_index !== false) {
+                $sub_path_start = $resource_index + $prefix_len;
+                /* a resource directly in the page folder has no
+                   sub-path, and a non-positive length must not be
+                   handed to substr */
+                if ($index > $sub_path_start) {
+                    $sub_path = substr($path, $sub_path_start,
+                        $index - $sub_path_start);
+                }
+            }
+            $recommendations[] = [$row['GROUP_ID'], $row['PAGE_ID'],
+                $page_title, $name, $sub_path];
+        }
+        return $recommendations;
+    }
 }
diff --git a/src/views/elements/ManageaccountElement.php b/src/views/elements/ManageaccountElement.php
index a94b518b9..4a95f6382 100755
--- a/src/views/elements/ManageaccountElement.php
+++ b/src/views/elements/ManageaccountElement.php
@@ -95,7 +95,8 @@ class ManageaccountElement extends Element
                 tl('manageaccount_element_more_groups')?></a></b></div><?php
             }
             if (!empty($data['THREAD_RECOMMENDATIONS']) ||
-                !empty($data['GROUP_RECOMMENDATIONS'])) {
+                !empty($data['GROUP_RECOMMENDATIONS']) ||
+                !empty($data['RESOURCE_RECOMMENDATIONS'])) {
                 ?>
                 <h2><?=tl('manageaccount_element_recommendations')?></h2>
                 <div class="access-result">
@@ -123,7 +124,25 @@ class ManageaccountElement extends Element
                     <?php
                     }?></ul><?php
                 }
-                ?></div><?php
+                ?><br /><?php
+                if (!empty($data['RESOURCE_RECOMMENDATIONS'])) {
+                    ?><b><?=tl('manageaccount_element_rec_resources')
+                    ?></b><ul><?php
+                    foreach ($data['RESOURCE_RECOMMENDATIONS'] as
+                        $recommendation) {
+                        // urlencode so names containing &, #, % etc. survive
+                        $encoded_name = urlencode($recommendation[3]);
+                        ?><li>
+                        <a href="<?= htmlentities(B\controllerUrl("group")) .
+                            "/$recommendation[0]/$recommendation[2]?" .
+                            "$token&page_id=$recommendation[1]&" .
+                            "sf=$recommendation[4]&arg=media&n=$encoded_name"
+                            ?>" ><?= htmlentities($recommendation[3])
+                            ?></a></li>
+                        <?php
+                    }?></ul><?php
+                }
+                ?></div><?php
             }
             ?>
             </div>
@@ -320,4 +338,4 @@ class ManageaccountElement extends Element
         </div>
         </div><?php
     }
-}
+}
\ No newline at end of file