diff --git a/src/configs/Config.php b/src/configs/Config.php
index 380c9c2ad..f02b2f5d9 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
* Version number for upgrade database function
* @var int
*/
-nsdefine('DATABASE_VERSION', 72);
+nsdefine('DATABASE_VERSION', 73);
/**
* Minimum Version fo Yioop for which keyword ad script
* still works with this version
@@ -1225,3 +1225,5 @@ nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
nsconddefine('NUM_LEX_BULK_INSERTS',100000);
/** Length of advertisement credits service account id string*/
nsconddefine('AD_CREDITS_SERVICE_ACCOUNT_LEN', 32);
+/**
+ * Type used to indicate recommendation scheme. When this is 1,
+ * RecommendationJob additionally initializes HASH2VEC and computes user item
+ * similarity from the USER_TERM_WEIGHTS_HASH2VEC table; for any other value
+ * the USER_TERM_WEIGHTS table is used
+ */
+nsdefine('RECOMMENDATION_TYPE', 1);
\ No newline at end of file
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 42e6c1bd9..b37a3b902 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2019,3 +2019,23 @@ function upgradeDatabaseVersion72(&$db)
$db->execute("ALTER TABLE USERS ADD COLUMN AD_CREDITS_SERVICE_ACCOUNT
VARCHAR(" . C\AD_CREDITS_SERVICE_ACCOUNT_LEN . ")");
}
+/**
+ * Upgrades a Version 72 version of the Yioop database to a Version 73 version.
+ * Drops (if present) and then creates the three tables used by the HASH2VEC
+ * recommendation scheme: USER_TERM_WEIGHTS_HASH2VEC,
+ * USER_ITEM_SIMILARITY_HASH2VEC, and HASH2VEC_TERM_SIMILARITY
+ *
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion73(&$db)
+{
+ $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+ "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+ // NOTE(review): presumably the integer column type name for the
+ // configured DBMS -- confirm against the datasource's integerType()
+ $integer = $db->integerType($dbinfo);
+ // dropping first discards any rows already stored in these tables
+ $db->execute("DROP TABLE IF EXISTS USER_TERM_WEIGHTS_HASH2VEC");
+ $db->execute("DROP TABLE IF EXISTS USER_ITEM_SIMILARITY_HASH2VEC");
+ $db->execute("DROP TABLE IF EXISTS HASH2VEC_TERM_SIMILARITY");
+ $db->execute("CREATE TABLE USER_TERM_WEIGHTS_HASH2VEC(TERM_ID $integer,
+ USER_ID $integer, WEIGHT FLOAT, PRIMARY KEY(TERM_ID, USER_ID))");
+ $db->execute("CREATE TABLE USER_ITEM_SIMILARITY_HASH2VEC(USER_ID $integer,
+ THREAD_ID $integer, SIMILARITY FLOAT, GROUP_MEMBER $integer,
+ PRIMARY KEY(USER_ID, THREAD_ID))");
+ $db->execute("CREATE TABLE HASH2VEC_TERM_SIMILARITY(TERM1 $integer,
+ TERM2 $integer, SCORE FLOAT, PRIMARY KEY(TERM1, TERM2))");
+}
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 2ea4dbff0..db6758dc2 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -76,6 +76,36 @@ class RecommendationJob extends MediaJob
* Maximum number of terms used in making recommendations
*/
const MAX_TERMS = 20000;
+ /**
+ * Regular expression pattern used to clean words in the data corpus
+ * for the HASH2VEC approach: generateVectors() deletes every run of the
+ * listed punctuation characters from a word
+ */
+ const WORD_SPLIT_PATTERN = "/([.,!?\"':;)(])+/";
+ /**
+ * Dimensions of HASH2VEC vectors generated for words in data corpus, i.e.,
+ * the number of components each word vector is created with
+ */
+ const HASH2VEC_VECTOR_LENGTH = 200;
+ /**
+ * Length of context window for HASH2VEC similarity calculation: the
+ * maximum number of words taken on each side of a word as its context
+ */
+ const CONTEXT_WIDTH = 5;
+ /**
+ * Number of similar words to keep for a word in HASH2VEC
+ */
+ const MAX_SIMILAR_WORDS = 10;
+ /**
+ * Array of the generated HASH2VEC vectors for words in the data corpus,
+ * keyed by word id
+ * @var array
+ */
+ public $hash2vec_vectors = [];
+ /**
+ * Associative array mapping each word in the data corpus for HASH2VEC
+ * to its word id
+ * @var array
+ */
+ public $hash2vec_words_dictionary = [];
+ /**
+ * Associative array mapping each word id back to its word in the data
+ * corpus for HASH2VEC
+ * @var array
+ */
+ public $hash2vec_words_reverse_dictionary = [];
/**
* Sets up the database connection so can access tables related
* to recommendations. Initialize timing info related to job.
@@ -200,6 +230,9 @@ class RecommendationJob extends MediaJob
$this->computeUserItemIdf($number_items, $number_users);
$this->tfIdfUsers();
$this->tfIdfItems();
+ if (C\RECOMMENDATION_TYPE == 1) {
+ $this->initializeHash2Vec();
+ }
$this->computeUserItemSimilarity();
$not_belongs_subselect = "NOT EXISTS (SELECT * FROM ".
"GROUP_ITEM B WHERE S.USER_ID=B.USER_ID ".
@@ -491,14 +524,32 @@ class RecommendationJob extends MediaJob
{
L\crawlLog("...Computing User Item Similarity Scores.");
$db = $this->db;
+ $user_weight_table = "USER_TERM_WEIGHTS";
+ if (C\RECOMMENDATION_TYPE == 1) {
+ $this->db->execute("INSERT INTO USER_TERM_WEIGHTS_HASH2VEC
+ SELECT * FROM USER_TERM_WEIGHTS");
+ $this->db->execute("UPDATE USER_TERM_WEIGHTS_HASH2VEC
+ SET WEIGHT=COALESCE((SELECT SUM(HTS.SCORE*UTW.WEIGHT)
+ FROM HASH2VEC_TERM_SIMILARITY HTS, USER_TERM_WEIGHTS UTW
+ WHERE HTS.TERM2 = UTW.TERM_ID AND HTS.TERM1 =
+ USER_TERM_WEIGHTS_HASH2VEC.TERM_ID),0)");
+ $this->db->execute("UPDATE USER_TERM_WEIGHTS_HASH2VEC
+ SET WEIGHT=WEIGHT+COALESCE((SELECT UTW.WEIGHT
+ FROM USER_TERM_WEIGHTS UTW WHERE
+ USER_TERM_WEIGHTS_HASH2VEC.TERM_ID = UTW.TERM_ID
+ AND USER_TERM_WEIGHTS_HASH2VEC.USER_ID = UTW.USER_ID),0)");
+ $this->db->execute("INSERT INTO USER_ITEM_SIMILARITY_HASH2VEC
+ SELECT * FROM USER_ITEM_SIMILARITY");
+ $user_weight_table = "USER_TERM_WEIGHTS_HASH2VEC";
+ }
$similarity_parts_sql =
"SELECT SUM(UTW.WEIGHT * ITW.WEIGHT) AS THREAD_DOT_USER, ".
"SUM(UTW.WEIGHT * UTW.WEIGHT) AS USER_MAG," .
"SUM(ITW.WEIGHT * ITW.WEIGHT) AS ITEM_MAG," .
"GI.PARENT_ID AS THREAD_ID, UTW.USER_ID AS USER_ID ".
- "FROM ITEM_TERM_WEIGHTS ITW, USER_TERM_WEIGHTS UTW, GROUP_ITEM GI ".
- "WHERE GI.ID = ITW.ITEM_ID AND UTW.TERM_ID=ITW.TERM_ID " .
- "GROUP BY UTW.USER_ID, GI.PARENT_ID";
+ "FROM ITEM_TERM_WEIGHTS ITW, $user_weight_table UTW, ".
+ "GROUP_ITEM GI WHERE GI.ID = ITW.ITEM_ID AND ".
+ "UTW.TERM_ID=ITW.TERM_ID GROUP BY UTW.USER_ID, GI.PARENT_ID";
$similarity_parts_result = $db->execute($similarity_parts_sql);
//used to check if belong to group
$member_info_sql = "SELECT GI.GROUP_ID FROM ".
@@ -618,4 +669,302 @@ class RecommendationJob extends MediaJob
$db->execute($insert_ignore_sql);
}
}
+    /**
+     * Initializes data corpus for HASH2VEC recommendation approach. The data
+     * consists of the concatenated title and description text of group items,
+     * newest first and limited via limitOffset(self::MAX_GROUP_ITEMS), with
+     * one item per line. The corpus is then handed to generateVectors()
+     */
+    public function initializeHash2Vec()
+    {
+        L\crawlLog("...Initializing Hash2Vec.");
+        $db = $this->db;
+        // every fragment ends in a space so that the clauses remain separate
+        // tokens once the pieces are concatenated
+        $group_item_sql = "SELECT ID AS ITEM_ID, TITLE, DESCRIPTION " .
+            "FROM GROUP_ITEM " .
+            "WHERE LOWER(TITLE) NOT LIKE '%page%' " .
+            "AND LOWER(DESCRIPTION) NOT LIKE '%-0700%' " .
+            "ORDER BY PUBDATE DESC " . $db->limitOffset(self::MAX_GROUP_ITEMS);
+        $results = $db->execute($group_item_sql);
+        $data_corpus = "";
+        while ($item = $db->fetchArray($results)) {
+            $data_corpus .= $item['TITLE'] . " " . $item['DESCRIPTION'] . "\n";
+        }
+        $this->generateVectors($data_corpus);
+    }
+    /**
+     * Generates the HASH2VEC vectors for words in the given data corpus, then
+     * normalizes the vectors and stores the resulting term similarities
+     *
+     * Each corpus line is lower-cased, split on single spaces, and stripped of
+     * the punctuation matched by WORD_SPLIT_PATTERN. For every word of a line,
+     * each neighbour in its context window adds
+     * sign * exp(-(distance / standard deviation)^2) to the component of the
+     * word's vector that getHashIndex() selects for the neighbour.
+     *
+     * @param string $data_corpus the data corpus of group items in the form
+     *      title + description for each item per line
+     */
+    public function generateVectors($data_corpus)
+    {
+        L\crawlLog("...Generating Hash2Vec Vectors.");
+        $context_distance_vector = [];
+        for ($i = 0; $i < self::CONTEXT_WIDTH; $i++) {
+            $context_distance_vector[] = self::CONTEXT_WIDTH - $i;
+        }
+        for ($i = 0; $i < self::CONTEXT_WIDTH; $i++) {
+            $context_distance_vector[] = $i;
+        }
+        $standard_deviation = $this->calculateStandardDeviation(
+            $context_distance_vector);
+        // the whole corpus is lower-cased once here, so lines need no second
+        // pass
+        $data_lines = explode("\n", strtolower($data_corpus));
+        foreach ($data_lines as $line) {
+            $line = preg_replace("/[\n\r]/", '', $line);
+            if (strlen($line) == 0) {
+                continue;
+            }
+            $words = explode(" ", $line);
+            if (count($words) <= 1) {
+                continue;
+            }
+            $word_ids = [];
+            foreach ($words as $word) {
+                if (!$word) {
+                    continue;
+                }
+                $clean_word = preg_replace(self::WORD_SPLIT_PATTERN, '', $word);
+                // a token consisting only of punctuation cleans to "" (and a
+                // failed preg_replace gives null); neither is a word
+                if ($clean_word === null || $clean_word === '') {
+                    continue;
+                }
+                // offer the dictionary size as the id: it only grows when a
+                // new word is added, so ids stay in step with the vectors
+                // that wordToId() creates (a per-occurrence counter does not)
+                $word_ids[] = $this->wordToId($clean_word,
+                    count($this->hash2vec_words_dictionary));
+            }
+            foreach ($word_ids as $word_index => $id) {
+                list($context_words, $distances) =
+                    $this->getContextWords($word_ids, $word_index);
+                foreach ($context_words as $i => $context_id) {
+                    $power = pow($distances[$i] / $standard_deviation, 2);
+                    $distance = exp(-1 * $power);
+                    list($index, $sign) = $this->getHashIndex(
+                        $this->hash2vec_words_reverse_dictionary[$context_id]);
+                    $this->hash2vec_vectors[$id][$index] += $sign * $distance;
+                }
+            }
+        }
+        $this->normalizeVectors();
+        $this->calculateSimilarityHash2Vec();
+    }
+    /**
+     * Performs normalization for the HASH2VEC vectors in order to avoid
+     * the features with less values getting neglected in calculating
+     * similarity
+     *
+     * Every component is replaced by its absolute value divided by the sum of
+     * the absolute values of its vector, so each non-zero vector sums to 1.
+     * A vector whose components are all 0 is left as all zeros.
+     */
+    public function normalizeVectors()
+    {
+        L\crawlLog("...Normalizing Hash2Vec Vectors.");
+        // iterate by key rather than 0..count-1 so that every stored vector
+        // is normalized even if word ids are not a contiguous range
+        foreach ($this->hash2vec_vectors as $id => $vector) {
+            // array_map with a single array keeps the component keys
+            $magnitudes = array_map('abs', $vector);
+            $sum = array_sum($magnitudes);
+            if ($sum == 0) {
+                $sum = 1;
+            }
+            foreach ($magnitudes as $index => $value) {
+                $this->hash2vec_vectors[$id][$index] = $value * 1. / $sum;
+            }
+        }
+    }
+ /**
+ * Stores, for every word in the hash2vec words dictionary, the similar
+ * words returned by getSimilarWords() (at most MAX_SIMILAR_WORDS per word)
+ * together with their scores in the HASH2VEC_TERM_SIMILARITY table.
+ * Rows already stored for a word are deleted before its new rows are
+ * queued, and queued rows are written in batches of BATCH_SQL_INSERT_NUM
+ */
+ public function calculateSimilarityHash2Vec()
+ {
+ L\crawlLog("...Generating Hash2Vec Similarity Score.");
+ $db=$this->db;
+ $base_sql = "INSERT INTO HASH2VEC_TERM_SIMILARITY VALUES";
+ $insert_sql = $base_sql;
+ $insert_count = 0;
+ $comma = "";
+ foreach ($this->hash2vec_words_reverse_dictionary as $id => $word) {
+ $similar_words = $this->getSimilarWords($word);
+ if (!empty($similar_words)) {
+ // numeric id derived from the crc32b hash of the word.
+ // NOTE(review): presumably this matches how TERM_ID values are
+ // computed for the term weight tables -- confirm
+ $word_hash = floor(bindec(str_replace(" ", "",
+ L\toBinString(hash("crc32b", strtolower($word), true))))/2);
+ $db->execute("DELETE FROM HASH2VEC_TERM_SIMILARITY WHERE
+ TERM1 = " . $word_hash);
+ foreach ($similar_words as $item) {
+ // $item is a [similar word, score] pair
+ $term_hash = floor(bindec(str_replace(" ", "",
+ L\toBinString(hash("crc32b", $item[0], true))))/2);
+ $score = $item[1];
+ $insert_sql .= "$comma ($word_hash, $term_hash,
+ $score)";
+ $comma = ",";
+ $insert_count++;
+ // a full batch is written out and a fresh statement started
+ if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
+ $insert_ignore_sql = $db->insertIgnore($insert_sql);
+ $db->execute($insert_ignore_sql);
+ $insert_sql = $base_sql;
+ $insert_count = 0;
+ $comma = "";
+ }
+ }
+ }
+ }
+ // write the rows left over from the last, partially filled batch
+ if ($insert_count > 0) {
+ $insert_ignore_sql = $db->insertIgnore($insert_sql);
+ $db->execute($insert_ignore_sql);
+ }
+ }
+    /**
+     * Scores every other word vector against the vector of the given word and
+     * keeps the MAX_SIMILAR_WORDS highest scoring words
+     *
+     * The score of a candidate is the dot product of the two vectors divided
+     * by the Euclidean length of the candidate vector.
+     * NOTE(review): the length of the given word's own vector is not divided
+     * out, so scores are a scaled cosine -- confirm this is intended.
+     *
+     * @param string $word word for which similar words are to be calculated
+     * @return array|null list of [similar word, score] pairs ordered from
+     *      lowest to highest score; empty if there is nothing to compare
+     *      with; null if $word is not in the dictionary
+     */
+    public function getSimilarWords($word)
+    {
+        $word = strtolower($word);
+        if (!array_key_exists($word, $this->hash2vec_words_dictionary)) {
+            return null;
+        }
+        $word_id = $this->hash2vec_words_dictionary[$word];
+        if (!isset($this->hash2vec_vectors[$word_id])) {
+            // no vector was stored for this word, so nothing can be scored
+            return [];
+        }
+        $word_vector = $this->hash2vec_vectors[$word_id];
+        // min-heap: its top is always the weakest of the kept candidates
+        $heap = new \SplMinHeap();
+        foreach ($this->hash2vec_vectors as $index => $vector) {
+            if ($index == $word_id) {
+                continue;
+            }
+            $sum = 0;
+            foreach ($vector as $value) {
+                $sum += $value * $value;
+            }
+            if ($sum == 0) {
+                $sum = 1;
+            }
+            $score = $this->dotProduct($word_vector, $vector) / sqrt($sum);
+            if ($heap->count() < self::MAX_SIMILAR_WORDS) {
+                $heap->insert([$score, $index]);
+            } else if ($heap->top()[0] < $score) {
+                $heap->extract();
+                $heap->insert([$score, $index]);
+            }
+        }
+        $similar_words = [];
+        // drain with isEmpty()/extract(): calling top() on an empty heap (for
+        // example when the dictionary holds a single word) throws a
+        // RuntimeException
+        while (!$heap->isEmpty()) {
+            list($score, $index) = $heap->extract();
+            $similar_words[] =
+                [$this->hash2vec_words_reverse_dictionary[$index], $score];
+        }
+        return $similar_words;
+    }
+    /**
+     * Calculates the population standard deviation of given data elements,
+     * that is, the square root of the mean squared difference from the mean
+     *
+     * @param array $data array of numeric elements
+     * @return float standard deviation; 0 when $data is empty
+     */
+    public function calculateStandardDeviation($data)
+    {
+        $num_elements = count($data);
+        if ($num_elements == 0) {
+            // avoids dividing by zero below
+            return 0.0;
+        }
+        // the mean is kept at full precision: rounding it first would skew
+        // every squared difference
+        $average = array_sum($data) / $num_elements;
+        $sum = 0;
+        foreach ($data as $value) {
+            $sum += pow($value - $average, 2);
+        }
+        return sqrt($sum / $num_elements);
+    }
+    /**
+     * Returns the id of a word, first adding the word to the dictionary and
+     * creating its zero-filled HASH2VEC vector if the word is not yet known
+     *
+     * A new word always receives the next sequential id, which is also the
+     * key under which its vector is stored, so that dictionary ids can be
+     * used directly as keys of hash2vec_vectors.
+     *
+     * @param string $word word to insert into dictionary
+     * @param int $id id proposed by the caller; it is only honoured when it
+     *      equals the next sequential id, since any other value would leave
+     *      the word's id pointing at a vector that does not exist
+     * @return int id assigned to the word
+     */
+    public function wordToId($word, $id)
+    {
+        if (!array_key_exists($word, $this->hash2vec_words_dictionary)) {
+            $next_id = count($this->hash2vec_vectors);
+            $this->hash2vec_words_dictionary[$word] = $next_id;
+            $this->hash2vec_words_reverse_dictionary[$next_id] = $word;
+            $this->hash2vec_vectors[$next_id] =
+                array_fill(0, self::HASH2VEC_VECTOR_LENGTH, 0);
+        }
+        return $this->hash2vec_words_dictionary[$word];
+    }
+    /**
+     * Generates appropriate context window of words for the given word: up
+     * to CONTEXT_WIDTH words before it and up to CONTEXT_WIDTH words after
+     * it, clipped at the two ends of the line
+     *
+     * @param array $word_ids ids of words in current data line
+     * @param int $index the index of word for which the context window is
+     *      calculated
+     * @return array pair [context word ids, distances]; context words are in
+     *      line order and distances[i] is the number of positions between
+     *      context word i and the given word (1 for a direct neighbour)
+     */
+    public function getContextWords($word_ids, $index)
+    {
+        $start_idx = max(0, $index - self::CONTEXT_WIDTH);
+        $end_idx = min(count($word_ids), $index + 1 + self::CONTEXT_WIDTH);
+        $context_words = [];
+        $distances = [];
+        for ($i = $start_idx; $i < $end_idx; $i++) {
+            if ($i == $index) {
+                continue;
+            }
+            $context_words[] = $word_ids[$i];
+            // taking the distance from the position itself keeps each
+            // distance paired with the right word on both sides
+            $distances[] = abs($index - $i);
+        }
+        return [$context_words, $distances];
+    }
+    /**
+     * Calculates the index in HASH2VEC vector of the word where the given
+     * word's context should be written, and the sign to write it with, using
+     * the raw md5 digest of the given word
+     *
+     * Index and sign are taken from different 32 bit parts of the digest.
+     * Deriving both from one number would tie the sign to the parity of the
+     * index whenever HASH2VEC_VECTOR_LENGTH is even, so that words sharing a
+     * component could never cancel each other.
+     *
+     * @param string $word whose hash value is to be calculated
+     * @return array of index (0 .. HASH2VEC_VECTOR_LENGTH - 1) and sign
+     *      (1 or -1) of hash
+     */
+    public function getHashIndex($word)
+    {
+        // "N2" unpacks the first 8 digest bytes as two unsigned 32 bit
+        // big-endian integers, returned under the keys 1 and 2
+        $parts = unpack("N2", md5($word, true));
+        // the mask keeps the value non-negative where PHP integers are 32 bit
+        $index = ($parts[1] & 0x7FFFFFFF) % self::HASH2VEC_VECTOR_LENGTH;
+        $sign = ($parts[2] & 1) ? 1 : -1;
+        return [$index, $sign];
+    }
+    /**
+     * Computes the inner product of two vectors: the sum of
+     * $vector1[i] * $vector2[i] for i from 0 up to count($vector1) - 1
+     *
+     * @param array $vector1 array representing vector 1 elements
+     * @param array $vector2 array representing vector 2 elements
+     * @return float product of two vectors
+     */
+    public function dotProduct($vector1, $vector2)
+    {
+        $length = count($vector1);
+        $total = 0;
+        $position = 0;
+        while ($position < $length) {
+            $total += $vector1[$position] * $vector2[$position];
+            $position++;
+        }
+        return $total;
+    }
}
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 3ba5f356c..0e507a22b 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -390,6 +390,16 @@ class ProfileModel extends Model
ACCESS_COUNT $integer,
PRIMARY KEY(ADDRESS, PAGE_NAME))",
"VERSION" => "CREATE TABLE VERSION(ID $integer PRIMARY KEY)",
+ "USER_TERM_WEIGHTS_HASH2VEC" => "CREATE TABLE
+ USER_TERM_WEIGHTS_HASH2VEC(TERM_ID $integer, USER_ID $integer,
+ WEIGHT FLOAT, PRIMARY KEY(TERM_ID, USER_ID))",
+ "USER_ITEM_SIMILARITY_HASH2VEC" => "CREATE TABLE
+ USER_ITEM_SIMILARITY_HASH2VEC(USER_ID $integer, THREAD_ID
+ $integer, SIMILARITY FLOAT, GROUP_MEMBER $integer,
+ PRIMARY KEY(USER_ID, THREAD_ID))",
+ "HASH2VEC_TERM_SIMILARITY" => "CREATE TABLE HASH2VEC_TERM_SIMILARITY
+ (TERM1 $integer, TERM2 $integer, SCORE FLOAT,
+ PRIMARY KEY(TERM1, TERM2))",
];
}
/**