Fixes Issue 240, Refactored Code for Hash2Vec approach, r=chris

Parth Patel [2022-05-11]
Fixes Issue 240, Refactored Code for Hash2Vec approach, r=chris

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
src/configs/Config.php
src/library/VersionFunctions.php
src/library/media_jobs/RecommendationJob.php
src/models/ProfileModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 380c9c2ad..f02b2f5d9 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -162,7 +162,7 @@ nsconddefine('GENERATOR_STRING', "Yioop");
  * Version number for upgrade database function
  * @var int
  */
-nsdefine('DATABASE_VERSION', 72);
+nsdefine('DATABASE_VERSION', 73);
 /**
 * Minimum Version of Yioop for which keyword ad script
  * still works with this version
@@ -1225,3 +1225,5 @@ nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
 nsconddefine('NUM_LEX_BULK_INSERTS',100000);
 /** Length of advertisement credits service account id string*/
 nsconddefine('AD_CREDITS_SERVICE_ACCOUNT_LEN', 32);
+/** Type used to indicate recommendation scheme */
+nsdefine('RECOMMENDATION_TYPE', 1);
\ No newline at end of file
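
The hunk above uses both nsdefine and nsconddefine. The actual helpers are
defined outside this hunk, so the bodies below are an assumption, but a minimal
sketch of what they are taken to do (PHP, namespace inferred from the C\ alias
used elsewhere in the code) helps explain how the new constant behaves:

<?php
// Assumed behavior of Yioop's namespaced define helpers; approximation only.
namespace seekquarry\yioop\configs;

function nsdefine($constant, $value)
{
    // unconditionally defines the constant in the configs namespace
    define(__NAMESPACE__ . "\\" . $constant, $value);
}
function nsconddefine($constant, $value)
{
    // defines the constant only if nothing (for example, a local
    // configuration override) has defined it already
    if (!defined(__NAMESPACE__ . "\\" . $constant)) {
        define(__NAMESPACE__ . "\\" . $constant, $value);
    }
}

Since RECOMMENDATION_TYPE is added with nsdefine rather than nsconddefine, it
presumably is not meant to be overridden the way conditionally defined
constants are; switching back to the original tf-idf-only recommendations would
mean changing this value to something other than 1, the value the job checks
below.
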
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 42e6c1bd9..b37a3b902 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -2019,3 +2019,23 @@ function upgradeDatabaseVersion72(&$db)
     $db->execute("ALTER TABLE USERS ADD COLUMN AD_CREDITS_SERVICE_ACCOUNT
         VARCHAR(" . C\AD_CREDITS_SERVICE_ACCOUNT_LEN . ")");
 }
+/**
+ * Upgrades a Version 72 version of the Yioop database to a Version 73 version
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion73(&$db)
+{
+    $dbinfo = ["DBMS" => C\DBMS, "DB_HOST" => C\DB_HOST,
+        "DB_NAME" => C\DB_NAME, "DB_PASSWORD" => C\DB_PASSWORD];
+    $integer = $db->integerType($dbinfo);
+    $db->execute("DROP TABLE IF EXISTS USER_TERM_WEIGHTS_HASH2VEC");
+    $db->execute("DROP TABLE IF EXISTS USER_ITEM_SIMILARITY_HASH2VEC");
+    $db->execute("DROP TABLE IF EXISTS HASH2VEC_TERM_SIMILARITY");
+    $db->execute("CREATE TABLE USER_TERM_WEIGHTS_HASH2VEC(TERM_ID $integer,
+        USER_ID $integer, WEIGHT FLOAT, PRIMARY KEY(TERM_ID, USER_ID))");
+    $db->execute("CREATE TABLE USER_ITEM_SIMILARITY_HASH2VEC(USER_ID $integer,
+        THREAD_ID $integer, SIMILARITY FLOAT, GROUP_MEMBER $integer,
+        PRIMARY KEY(USER_ID, THREAD_ID))");
+    $db->execute("CREATE TABLE HASH2VEC_TERM_SIMILARITY(TERM1 $integer,
+        TERM2 $integer, SCORE FLOAT, PRIMARY KEY(TERM1, TERM2))");
+}
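
The migration above mirrors the existing tf-idf tables (USER_TERM_WEIGHTS,
USER_ITEM_SIMILARITY) and adds a term-to-term similarity table. A minimal
standalone sketch of the shape of data these tables hold and of the kind of
aggregation the job later performs against HASH2VEC_TERM_SIMILARITY (plain PDO
with in-memory SQLite rather than Yioop's DatasourceManager; the ids, weights,
and the single-user restriction are invented for illustration):

<?php
// Hypothetical data only; the table shapes follow the migration above.
$db = new PDO("sqlite::memory:");
$db->exec("CREATE TABLE HASH2VEC_TERM_SIMILARITY(TERM1 INTEGER, TERM2 INTEGER,
    SCORE FLOAT, PRIMARY KEY(TERM1, TERM2))");
$db->exec("CREATE TABLE USER_TERM_WEIGHTS(TERM_ID INTEGER, USER_ID INTEGER,
    WEIGHT FLOAT, PRIMARY KEY(TERM_ID, USER_ID))");
// term 101 is similar to terms 102 and 103 with these Hash2Vec scores
$db->exec("INSERT INTO HASH2VEC_TERM_SIMILARITY VALUES (101, 102, 0.8),
    (101, 103, 0.4)");
// user 7 has tf-idf weights for terms 102 and 103
$db->exec("INSERT INTO USER_TERM_WEIGHTS VALUES (102, 7, 0.5), (103, 7, 0.25)");
// weight of term 101 for user 7 once similar-term weights are folded in:
// 0.8 * 0.5 + 0.4 * 0.25 = 0.5
$row = $db->query("SELECT SUM(HTS.SCORE * UTW.WEIGHT) AS WEIGHT
    FROM HASH2VEC_TERM_SIMILARITY HTS, USER_TERM_WEIGHTS UTW
    WHERE HTS.TERM2 = UTW.TERM_ID AND HTS.TERM1 = 101 AND UTW.USER_ID = 7")
    ->fetch(PDO::FETCH_ASSOC);
echo "hash2vec weight of term 101 for user 7: " . $row["WEIGHT"] . "\n";

The UPDATE statements added to computeUserItemSimilarity() below perform the
same kind of fold for every row copied from USER_TERM_WEIGHTS and then add the
user's own tf-idf weight back on top.
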
diff --git a/src/library/media_jobs/RecommendationJob.php b/src/library/media_jobs/RecommendationJob.php
index 2ea4dbff0..db6758dc2 100644
--- a/src/library/media_jobs/RecommendationJob.php
+++ b/src/library/media_jobs/RecommendationJob.php
@@ -76,6 +76,36 @@ class RecommendationJob extends MediaJob
      * Maximum number of terms used in making recommendations
      */
     const MAX_TERMS = 20000;
+    /**
+     * Regular expression pattern used to strip punctuation from words in
+     * the data corpus for the HASH2VEC approach
+     */
+    const WORD_SPLIT_PATTERN = "/([.,!?\"':;)(])+/";
+    /**
+     * Number of dimensions of the HASH2VEC vectors generated for corpus words
+     */
+    const HASH2VEC_VECTOR_LENGTH = 200;
+    /**
+     * Length of context window for HASH2VEC similarity calculation
+     */
+    const CONTEXT_WIDTH = 5;
+    /**
+     * Number of similar words to keep for a word in HASH2VEC
+     */
+    const MAX_SIMILAR_WORDS = 10;
+    /**
+     * Array of the generated HASH2VEC vectors for words in the data corpus
+     * @var array
+     */
+    public $hash2vec_vectors = [];
+    /**
+     * Associative array mapping words in the data corpus to HASH2VEC ids
+     */
+    public $hash2vec_words_dictionary = [];
+    /**
+     * Associative array mapping HASH2VEC ids back to words in the corpus
+     */
+    public $hash2vec_words_reverse_dictionary = [];
     /**
      * Sets up the database connection so can access tables related
      * to recommendations. Initialize timing info related to job.
@@ -200,6 +230,9 @@ class RecommendationJob extends MediaJob
         $this->computeUserItemIdf($number_items, $number_users);
         $this->tfIdfUsers();
         $this->tfIdfItems();
+        if (C\RECOMMENDATION_TYPE == 1) {
+            $this->initializeHash2Vec();
+        }
         $this->computeUserItemSimilarity();
         $not_belongs_subselect =  "NOT EXISTS (SELECT * FROM ".
             "GROUP_ITEM B WHERE S.USER_ID=B.USER_ID ".
@@ -491,14 +524,32 @@ class RecommendationJob extends MediaJob
     {
         L\crawlLog("...Computing User Item Similarity Scores.");
         $db = $this->db;
+        $user_weight_table = "USER_TERM_WEIGHTS";
+        if (C\RECOMMENDATION_TYPE == 1) {
+            $this->db->execute("INSERT INTO USER_TERM_WEIGHTS_HASH2VEC
+                SELECT * FROM USER_TERM_WEIGHTS");
+            $this->db->execute("UPDATE USER_TERM_WEIGHTS_HASH2VEC
+                SET WEIGHT=COALESCE((SELECT SUM(HTS.SCORE*UTW.WEIGHT)
+                FROM HASH2VEC_TERM_SIMILARITY HTS, USER_TERM_WEIGHTS UTW
+                WHERE HTS.TERM2 = UTW.TERM_ID AND HTS.TERM1 =
+                USER_TERM_WEIGHTS_HASH2VEC.TERM_ID),0)");
+            $this->db->execute("UPDATE USER_TERM_WEIGHTS_HASH2VEC
+                SET WEIGHT=WEIGHT+COALESCE((SELECT UTW.WEIGHT
+                FROM USER_TERM_WEIGHTS UTW WHERE
+                USER_TERM_WEIGHTS_HASH2VEC.TERM_ID = UTW.TERM_ID
+                AND USER_TERM_WEIGHTS_HASH2VEC.USER_ID = UTW.USER_ID),0)");
+            $this->db->execute("INSERT INTO USER_ITEM_SIMILARITY_HASH2VEC
+                SELECT * FROM USER_ITEM_SIMILARITY");
+            $user_weight_table = "USER_TERM_WEIGHTS_HASH2VEC";
+        }
         $similarity_parts_sql =
             "SELECT SUM(UTW.WEIGHT * ITW.WEIGHT) AS THREAD_DOT_USER, ".
             "SUM(UTW.WEIGHT * UTW.WEIGHT) AS USER_MAG," .
             "SUM(ITW.WEIGHT * ITW.WEIGHT) AS ITEM_MAG," .
             "GI.PARENT_ID AS THREAD_ID, UTW.USER_ID AS USER_ID ".
-            "FROM ITEM_TERM_WEIGHTS ITW, USER_TERM_WEIGHTS UTW, GROUP_ITEM GI ".
-            "WHERE GI.ID = ITW.ITEM_ID AND UTW.TERM_ID=ITW.TERM_ID " .
-            "GROUP BY UTW.USER_ID, GI.PARENT_ID";
+            "FROM ITEM_TERM_WEIGHTS ITW, $user_weight_table UTW, ".
+            "GROUP_ITEM GI WHERE GI.ID = ITW.ITEM_ID AND ".
+            "UTW.TERM_ID=ITW.TERM_ID GROUP BY UTW.USER_ID, GI.PARENT_ID";
         $similarity_parts_result = $db->execute($similarity_parts_sql);
         //used to check if belong to group
         $member_info_sql = "SELECT GI.GROUP_ID FROM ".
@@ -618,4 +669,302 @@ class RecommendationJob extends MediaJob
             $db->execute($insert_ignore_sql);
         }
     }
+    /**
+     * Initializes the data corpus for the HASH2VEC recommendation approach.
+     * The corpus consists of the concatenated title and description text of
+     * the group items, with one item per line
+     */
+    public function initializeHash2Vec()
+    {
+        L\crawlLog("...Initializing Hash2Vec.");
+        $db = $this->db;
+        $data_corpus = "";
+        $group_item_sql = "SELECT ID AS ITEM_ID, TITLE, DESCRIPTION ".
+            "FROM GROUP_ITEM ".
+            "WHERE LOWER(TITLE) NOT LIKE '%page%'" .
+            "AND LOWER(DESCRIPTION) NOT LIKE '%-0700%'" .
+            "ORDER BY PUBDATE DESC " . $db->limitOffset(self::MAX_GROUP_ITEMS);
+        $results = $db->execute($group_item_sql);
+        while ($item = $db->fetchArray($results)) {
+            $data_corpus .= $item['TITLE']. " ";
+            $data_corpus .= $item['DESCRIPTION'] . "\n";
+        }
+        $this->generateVectors($data_corpus);
+    }
+    /**
+     * Generates the HASH2VEC vectors for words in the given data corpus
+     *
+     * @param string $data_corpus the data corpus of group items, one
+     *      title + description pair per line
+     */
+    public function generateVectors($data_corpus)
+    {
+        L\crawlLog("...Generating Hash2Vec Vectors.");
+        for ($i=0; $i<self::CONTEXT_WIDTH; $i++) {
+            $context_distance_vector[] = -$i + self::CONTEXT_WIDTH;
+        }
+        for ($i=0; $i<self::CONTEXT_WIDTH; $i++) {
+            $context_distance_vector[] = $i;
+        }
+        $standard_deviation = $this->calculateStandardDeviation(
+            $context_distance_vector);
+        $word_id = 0;
+        $data_lines = explode("\n", strtolower($data_corpus));
+        foreach ($data_lines as $line) {
+            $line = preg_replace("/[\n\r]/",'',$line);
+            if (strlen($line) == 0) {
+                continue;
+            }
+            $words = explode(" ", strtolower($line));
+            if (count($words) <= 1) { continue; }
+            $clean_words = [];
+            foreach ($words as $word) {
+                if ($word){
+                    $clean_word = preg_replace(
+                        self::WORD_SPLIT_PATTERN, '', $word);
+                    $clean_words[] = $clean_word;
+                }
+            }
+            $word_ids = [];
+            foreach ($clean_words as $word) {
+                $word_ids[] = $this->wordToId($word, $word_id);
+                $word_id += 1;
+            }
+            $word_index = 0;
+            foreach ($word_ids as $id) {
+                list($context_words, $distances) =
+                    $this->getContextWords($word_ids, $word_index);
+                $i = 0;
+                foreach ($context_words as $word) {
+                    $power = pow($distances[$i] / $standard_deviation, 2);
+                    $distance = exp(-1 * $power);
+                    list($index, $sign) = $this->getHashIndex(
+                        $this->hash2vec_words_reverse_dictionary[$word]);
+                    $this->hash2vec_vectors[$id][$index] =
+                        $this->hash2vec_vectors[$id][$index] +
+                        $sign * $distance;
+                    $i += 1;
+                }
+                $word_index += 1;
+            }
+        }
+        $this->normalizeVectors();
+        $this->calculateSimilarityHash2Vec();
+    }
+    /**
+     * Normalizes the HASH2VEC vectors so that features with smaller
+     * magnitudes are not neglected when calculating the similarity
+     * between vectors
+     */
+    public function normalizeVectors()
+    {
+        L\crawlLog("...Normalizing Hash2Vec Vectors.");
+        for ($i = 0; $i < count($this->hash2vec_vectors); $i++) {
+            $vector = [];
+            foreach ($this->hash2vec_vectors[$i] as $value) {
+                $vector[] = abs($value);
+            }
+            $sum = array_sum($vector);
+            foreach ($vector as $index=>$value) {
+                if ($sum == 0) { $sum = 1; }
+                $this->hash2vec_vectors[$i][$index] = $value * 1. / $sum;
+            }
+        }
+    }
+    /**
+     * Calculates the self::MAX_SIMILAR_WORDS most similar words for every
+     * word in the hash2vec words dictionary. The similarity is the cosine
+     * coefficient between the corresponding vectors of the two words
+     */
+    public function calculateSimilarityHash2Vec()
+    {
+        L\crawlLog("...Generating Hash2Vec Similarity Score.");
+        $db=$this->db;
+        $base_sql = "INSERT INTO HASH2VEC_TERM_SIMILARITY VALUES";
+        $insert_sql = $base_sql;
+        $insert_count = 0;
+        $comma = "";
+        foreach ($this->hash2vec_words_reverse_dictionary as $id => $word) {
+            $similar_words = $this->getSimilarWords($word);
+            if (!empty($similar_words)) {
+                $word_hash = floor(bindec(str_replace(" ", "",
+                    L\toBinString(hash("crc32b", strtolower($word), true))))/2);
+                $db->execute("DELETE FROM HASH2VEC_TERM_SIMILARITY WHERE
+                    TERM1 = " . $word_hash);
+                foreach ($similar_words as $item) {
+                    $term_hash = floor(bindec(str_replace(" ", "",
+                        L\toBinString(hash("crc32b", $item[0], true))))/2);
+                    $score = $item[1];
+                    $insert_sql .= "$comma ($word_hash, $term_hash,
+                        $score)";
+                    $comma = ",";
+                    $insert_count++;
+                    if ($insert_count == self::BATCH_SQL_INSERT_NUM) {
+                        $insert_ignore_sql = $db->insertIgnore($insert_sql);
+                        $db->execute($insert_ignore_sql);
+                        $insert_sql = $base_sql;
+                        $insert_count = 0;
+                        $comma = "";
+                    }
+                }
+            }
+        }
+        if ($insert_count > 0) {
+            $insert_ignore_sql = $db->insertIgnore($insert_sql);
+            $db->execute($insert_ignore_sql);
+        }
+    }
+    /**
+     * Scores the given word against every other word in the dictionary and
+     * returns the self::MAX_SIMILAR_WORDS most similar ones
+     *
+     * @param string $word word for which similar words are to be calculated
+     * @return array array of [word, score] pairs, or null if word is unknown
+     */
+    public function getSimilarWords($word) {
+        $word = strtolower($word);
+        if (!array_key_exists($word, $this->hash2vec_words_dictionary)) {
+            return NULL;
+        }
+        $word_id = $this->hash2vec_words_dictionary[$word];
+        $word_vector = $this->hash2vec_vectors[$word_id];
+        $heap = new \SplMinHeap();
+        foreach ($this->hash2vec_vectors as $index => $vector) {
+            if ($index == $word_id) {
+                continue;
+            }
+            $power = [];
+            foreach ($vector as $value) {
+                $power[] = pow($value, 2);
+            };
+            $sum = array_sum($power);
+            if ($sum == 0) {
+                $sum = 1;
+            }
+            $root_sum = sqrt($sum);
+            $score = $this->dotProduct($word_vector, $vector) / $root_sum;
+            if ($heap->count() < self::MAX_SIMILAR_WORDS) {
+                $heap->insert([$score, $index]);
+            } else if ($heap->top()[0] < $score) {
+                $heap->extract();
+                $heap->insert([$score, $index]);
+            }
+        }
+        $similar_words = [];
+        for ($heap->top(); $heap->valid(); $heap->next()) {
+            $word = $heap->current();
+            array_push($similar_words, [$this->hash2vec_words_reverse_dictionary
+                [$word[1]], $word[0]]);
+        }
+        return $similar_words;
+    }
+    /**
+     * Calculates statistical standard deviation of given data elements
+     *
+     * @param array $data array of elements
+     * @return float standard deviation
+     */
+    public function calculateStandardDeviation($data)
+    {
+        $average = round(array_sum($data) / count($data), 1);
+        $differences = [];
+        foreach ($data as $value) {
+            $difference = $value - $average;
+            $differences[] = pow($difference, 2);
+        }
+        $sum = array_sum($differences);
+        $variance = $sum / count($differences);
+        $standard_deviation = sqrt($variance);
+        return $standard_deviation;
+    }
+    /**
+     * Inserts a word into the dictionary, if not present, and assigns an id
+     *
+     * @param string $word word to insert into the dictionary
+     * @param int $id id to assign to the word if it is new
+     * @return int id assigned to the word
+     */
+    public function wordToId($word, $id)
+    {
+        if (!array_key_exists($word, $this->hash2vec_words_dictionary)) {
+            $this->hash2vec_words_dictionary[$word] = $id;
+            $this->hash2vec_words_reverse_dictionary[$id] = $word;
+            $this->hash2vec_vectors[] =
+                array_fill(0,self::HASH2VEC_VECTOR_LENGTH,0);
+        }
+        return $this->hash2vec_words_dictionary[$word];
+    }
+    /**
+     * Generates the context window of words around the word at a given index
+     *
+     * @param array $word_ids ids of the words in the current data line
+     * @param int $index the index of the word for which the context window
+     *      is calculated
+     * @return array context word ids and their distances from the given word
+     */
+    public function getContextWords($word_ids, $index) {
+        $start_idx = 0;
+        $end_idx = $index + 1 + self::CONTEXT_WIDTH;
+        if ($index > self::CONTEXT_WIDTH) {
+            $start_idx = $index - self::CONTEXT_WIDTH;
+        }
+        if ($end_idx > count($word_ids)) { $end_idx = count($word_ids); }
+        $prefix = array_slice($word_ids, $start_idx, $index-$start_idx);
+        if ($index <= self::CONTEXT_WIDTH) {
+            $suffix = array_slice($word_ids,$index + 1,
+                $end_idx - $index - $start_idx - 1);
+        } else if ($index >= (count($word_ids) - self::CONTEXT_WIDTH)) {
+            $suffix = array_slice($word_ids, $index + 1, $end_idx - $index - 1);
+        } else {
+            $suffix = array_slice($word_ids, $index + 1, self::CONTEXT_WIDTH);
+        }
+        $context_words = array_merge($prefix, $suffix);
+        if ($index - $start_idx == 0) {
+            $prefix = [];
+        } else {
+            $prefix = range(1, $index - $start_idx);
+        }
+        if ($index <= self::CONTEXT_WIDTH) {
+            if ($end_idx - $index - $start_idx - 1 == 0) {
+                $suffix = [];
+            } else {
+                $suffix = range($end_idx - $index - $start_idx - 1, 1, -1);
+            }
+        } else if ($index == count($word_ids) - 1) {
+            $suffix = [];
+        } else {
+            $suffix = range($end_idx - $index - 1, 1, -1);
+        }
+        $distance = array_merge($prefix, $suffix);
+        assert(count($distance) == count($context_words));
+        return [$context_words, $distance];
+    }
+    /**
+     * Uses the md5 hash of the given word to calculate the HASH2VEC vector
+     * index at which this word's contribution is written, plus its sign
+     *
+     * @param string $word word whose hash value is to be calculated
+     * @return array two element array of the index and the sign
+     */
+    public function getHashIndex($word)
+    {
+        $hash = unpack("N", substr(md5($word), 0, 4))[1];
+        $index = $hash % self::HASH2VEC_VECTOR_LENGTH;
+        $sign = $hash % 2 ? 1 : -1;
+        return [$index, $sign];
+    }
+    /**
+     * Performs dot product operation on two vectors
+     *
+     * @param array $vector1 array representing vector 1 elements
+     * @param array $vector2 array representing vector 2 elements
+     * @return float dot product of the two vectors
+     */
+    public function dotProduct($vector1, $vector2) {
+        $product = 0;
+        for ($i = 0; $i < count($vector1); $i++) {
+            $product = $product + $vector1[$i] * $vector2[$i];
+        }
+        return $product;
+    }
 }
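
Stripped of the database plumbing, the Hash2Vec construction added above is
compact: hash each context word to a bucket and a sign (getHashIndex()), add a
Gaussian-weighted contribution based on how close it sits to the center word
(generateVectors()), and compare the resulting fixed-length vectors with a dot
product (dotProduct(), getSimilarWords()). A minimal standalone sketch of that
idea; the helper names, the sample corpus, and the fixed $sigma (which stands
in for the standard deviation the job computes from its context distance
vector) are invented for illustration:

<?php
// Standalone illustration of the Hash2Vec vector construction; the two
// constants mirror the class constants above, everything else is made up.
const VECTOR_LENGTH = 200; // plays the role of HASH2VEC_VECTOR_LENGTH
const CONTEXT_WIDTH = 5;
// Maps a word to a bucket index and a sign, as getHashIndex() does with md5
function hashIndex($word)
{
    $hash = unpack("N", substr(md5($word), 0, 4))[1];
    return [$hash % VECTOR_LENGTH, ($hash % 2) ? 1 : -1];
}
// Builds a fixed-length vector per word: each co-occurrence adds a
// Gaussian-weighted amount (closer context words count for more) into the
// bucket chosen by hashing the context word
function buildVectors($sentences, $sigma = 3.0)
{
    $vectors = [];
    foreach ($sentences as $sentence) {
        $words = preg_split('/\s+/', strtolower(trim($sentence)));
        foreach ($words as $i => $word) {
            if (!isset($vectors[$word])) {
                $vectors[$word] = array_fill(0, VECTOR_LENGTH, 0.0);
            }
            $start = max(0, $i - CONTEXT_WIDTH);
            $end = min(count($words) - 1, $i + CONTEXT_WIDTH);
            for ($j = $start; $j <= $end; $j++) {
                if ($j == $i) {
                    continue;
                }
                $weight = exp(-pow(abs($j - $i) / $sigma, 2));
                list($index, $sign) = hashIndex($words[$j]);
                $vectors[$word][$index] += $sign * $weight;
            }
        }
    }
    return $vectors;
}
// Unnormalized similarity score, as in dotProduct()
function dot($u, $v)
{
    $sum = 0.0;
    foreach ($u as $k => $value) {
        $sum += $value * $v[$k];
    }
    return $sum;
}
// Words that share more of their contexts tend to score higher
$vectors = buildVectors([
    "yioop computes recommendation scores for users",
    "hash2vec computes recommendation vectors for words",
]);
printf("score(yioop, hash2vec) = %.4f\n",
    dot($vectors["yioop"], $vectors["hash2vec"]));
printf("score(yioop, users)    = %.4f\n",
    dot($vectors["yioop"], $vectors["users"]));

Because every word hashes into one of only 200 buckets, unrelated words can
collide; the sign bit makes such collisions partly cancel instead of always
inflating the score, which is the usual argument for signed feature hashing.
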
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 3ba5f356c..0e507a22b 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -390,6 +390,16 @@ class ProfileModel extends Model
                 ACCESS_COUNT $integer,
                 PRIMARY KEY(ADDRESS, PAGE_NAME))",
             "VERSION" => "CREATE TABLE VERSION(ID $integer PRIMARY KEY)",
+            "USER_TERM_WEIGHTS_HASH2VEC" => "CREATE TABLE
+                USER_TERM_WEIGHTS_HASH2VEC(TERM_ID $integer, USER_ID $integer,
+                WEIGHT FLOAT, PRIMARY KEY(TERM_ID, USER_ID))",
+            "USER_ITEM_SIMILARITY_HASH2VEC" => "CREATE TABLE
+                USER_ITEM_SIMILARITY_HASH2VEC(USER_ID $integer, THREAD_ID
+                $integer, SIMILARITY FLOAT, GROUP_MEMBER $integer,
+                PRIMARY KEY(USER_ID, THREAD_ID))",
+            "HASH2VEC_TERM_SIMILARITY" => "CREATE TABLE HASH2VEC_TERM_SIMILARITY
+                (TERM1 $integer, TERM2 $integer, SCORE FLOAT,
+                PRIMARY KEY(TERM1, TERM2))",
             ];
     }
     /**