Bigram support

Ravi [2012-01-24 09:Jan:th]

Bigram support

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
configs/bigram_builder.php
lib/bigrams.php

diff --git a/configs/bigram_builder.php b/configs/bigram_builder.php
index fbbe560ca..a349c4dba 100644
--- a/configs/bigram_builder.php
+++ b/configs/bigram_builder.php
@@ -22,6 +22,48 @@
  *
  *  END LICENSE
  *
+ * Bigrams are pairs of words which always occur together in the same
+ * sequence in a user query, ex: "honda accord". Yioop treats these
+ * pairs of words as a single word to increase the speed and efficiency
+ * of retrieval. This script can be used to create a bigrams filter
+ * file for the yioop search engine to filter such words from documents
+ * and queries. The input to this script is an xml file which contains
+ * a large collection of such bigrams. The most common source of a large
+ * set of bigrams in an xml file is wikipedia dumps. Wikipedia dumps are
+ * available for download online free of cost. Typically the bigrams
+ * filter file is specific to a language, therefore the user has to create a
+ * separate filter file for each language to use this functionality. This
+ * script can be run multiple times to create a different filter file by
+ * specifying a different input xml and a different language as the
+ * command line arguments. The xml dumps of wikipedia specific to
+ * different languages are available to download. These language specific
+ * dumps of wikipedia as xml file serve as input to this script.
+ *
+ * For example the user can follow the following steps to create a bigrams
+ * filter for the english language.
+ *
+ * Step 1): Go to link http://dumps.wikimedia.org/enwiki/ which is source
+ * of dumps for english wikipedia. This page lists all the dumps according
+ * to date they were taken. Choose any suitable date or the latest. Say we
+ * chose 20120104/, dumps taken on 01/04/2012. This would take you to the
+ * page which has many links based on type of content you are looking for.
+ * We are interested in content titled
+ * "Recombine all pages, current versions only" with the link
+ * "enwiki-20120104-pages-meta-current.xml.bz2"
+ * This is a bz2 compressed xml file containing all the english pages of
+ * wikipedia. Download the file to the "search_filters" folder of your
+ * yioop work directory associated with your profile.
+ * (Note: You should have sufficient hard disk space in the order of
+ *        100GB to store the compressed dump and script extracted xml.
+ *        The script also accepts an uncompressed XML file as input.
+ *        The filter file generated is a few megabytes.)
+ *
+ * Step 2): Run this script from the php command line as below
+ * php bigram_builder.php enwiki-20120104-pages-meta-current.xml.bz2 en
+ *
+ * This would create a bigram filter for english in the same directory.
+ *
+ *
  * @author Ravi Dhillon  ravi.dhillon@yahoo.com
  * @package seek_quarry
  * @license http://www.gnu.org/licenses/ GPL3
@@ -36,12 +78,13 @@ ini_set("memory_limit","1024M");

 if(count($argv) != 3){
     echo "bigram_builder is used to create a bigram filter file for the \n".
-        "Yioop! search engine. This filter file is used to detect when two \n";
+        "Yioop! search engine. This filter file is used to detect when two \n".
         "words in a language should be treated as a unit. For example, \n".
         "Bill Clinton. bigram_builder is run from the command line as:\n".
         "php bigram.php wiki_xml lang\n".
-        "where wiki_xml is a wikimedia xml file whose urls will be used to\n"
-        "determine the bigrams and lang is an IANA language tag."
+        "where wiki_xml is a wikipedia xml file or a bz2 compressed xml\n".
+        "file whose urls will be used to determine the bigrams and lang\n".
+        "is an IANA language tag.";
     exit();
 }

diff --git a/lib/bigrams.php b/lib/bigrams.php
index 3d1b4315c..7a5716a18 100644
--- a/lib/bigrams.php
+++ b/lib/bigrams.php
@@ -62,10 +62,10 @@ require_once BASE_DIR."/lib/phrase_parser.php";
 class Bigrams
 {

-    /**
-     * Language tags and their corresponding bigram prefix
-     * @var array
-     */
+     /**
+      * Language tags and their corresponding bigram prefix
+      * @var array
+      */
      static $LANG_PREFIX = array(
         'en' => "en",
         'en-US' => "en",
@@ -73,12 +73,31 @@ class Bigrams
         'en-CA' => "en"
      );

+     /**
+      * @constant Name of the folder inside user work directory
+      * that contains the input compressed XML file. The filter
+      * file generated will also be stored in this folder.
+      */
      const FILTER_FOLDER = "/search_filters/";
+     /**
+      * @constant Suffix appended to language tag to create the
+      * filter file name containing bigrams.
+      */
      const FILTER_SUFFIX = "_bigrams.ftr";
+     /**
+      * @constant Suffix appended to language tag to create the
+      * text file name containing bigrams.
+      */
      const TEXT_SUFFIX = "_bigrams.txt";

     /**
-     * Extracts Bigrams from input set of phrases.
+     * Extracts Bigrams from input set of phrases. If a filter file
+     * is not available for $lang we just return the input phrases.
+     * Each pair of phrases is searched in filter file to check if
+     * it is a bigram. If a pair passes the bigram check we add it
+     * to the array of phrases as a single phrase otherwise individual
+     * phrases are added to the array. The resultant array of phrases
+     * is returned at the end.
      *
      * @param array $phrases subject to bigram check
      * @param string $lang Language to be used to stem bigrams.
@@ -90,7 +109,7 @@ class Bigrams
         if(isset(self::$LANG_PREFIX[$lang])) {
             $lang_prefix = self::$LANG_PREFIX[$lang];
         }
-        $filter_path = WORK_DIRECTORY .
+        $filter_path = WORK_DIRECTORY .
             self::FILTER_FOLDER . $lang_prefix . self::FILTER_SUFFIX;
         if (file_exists($filter_path)) {
             $bigrams = BloomFilterFile::load($filter_path);
@@ -124,7 +143,12 @@ class Bigrams


     /**
-     * Creates a bloom filter file from a bigram text file
+     * Creates a bloom filter file from a bigram text file. The
+     * path of bigram text file used is based on the input $lang.
+     * The name of the output filter file is based on $lang and its
+     * size is based on the input number of bigrams.
+     * The bigrams are read from text file, stemmed if a stemmer
+     * is available for $lang and then stored in filter file.
      *
      * @param string $lang language to be used to stem bigrams.
      * @param number $num_bigrams Count of bigrams in text file.
@@ -136,8 +160,8 @@ class Bigrams
         if(isset(self::$LANG_PREFIX[$lang])) {
             $lang_prefix = self::$LANG_PREFIX[$lang];
         }
-        $filter_path =
-            WORK_DIRECTORY . self::FILTER_FOLDER . $lang_prefix .
+        $filter_path =
+            WORK_DIRECTORY . self::FILTER_FOLDER . $lang_prefix .
             self::FILTER_SUFFIX;
         if (file_exists($filter_path)) {
             $bigrams = BloomFilterFile::load($filter_path);
@@ -146,7 +170,7 @@ class Bigrams
             $bigrams = new BloomFilterFile($filter_path, $num_bigrams);
         }

-        $inputFilePath = WORK_DIRECTORY . self::FILTER_FOLDER .
+        $inputFilePath = WORK_DIRECTORY . self::FILTER_FOLDER .
             $lang_prefix . self::TEXT_SUFFIX;
         $fp = fopen($inputFilePath, 'r') or die("Can't open bigrams text file");
         while ( ($bigram = fgets($fp)) !== false) {
@@ -164,10 +188,19 @@ class Bigrams
     }

     /**
-     * Generates a bigrams text file from input wikimedia xml file.
+     * Generates a bigrams text file from input wikipedia xml file.
+     * The input file can be bz2 compressed or uncompressed; if
+     * the file is compressed it is uncompressed by calling the function
+     * uncompressBz2File($compressed_wiki_file_path).
+     * The input XML file is parsed line by line and pattern for
+     * bigram is searched. If a bigram is found it is added to the
+     * array. After the complete file is parsed we remove the duplicate
+     * bigrams and sort them. The resulting array is written to the
+     * text file. The function returns the number of bigrams stored in
+     * the text file.
      *
-     * @param string $wiki_file wikimedia XML file name to be used to
-     *      extract bigrams.
+     * @param string $wiki_file compressed or uncompressed wikipedia
+     *      XML file path to be used to extract bigrams.
      * @param string $lang Language to be used to create bigrams.
      * @return number $num_bigrams Count of bigrams in text file.
      */
@@ -177,9 +210,18 @@ class Bigrams
         if(isset(self::$LANG_PREFIX[$lang])) {
             $lang_prefix = self::$LANG_PREFIX[$lang];
         }
-        $wiki_file_path = WORK_DIRECTORY.self::FILTER_FOLDER.$wiki_file;
+        $compressed_wiki_file_path =
+            WORK_DIRECTORY.self::FILTER_FOLDER.$wiki_file;
+        $found = strpos($compressed_wiki_file_path, "bz2");
+        if($found == false){
+            $wiki_file_path = $compressed_wiki_file_path;
+        }
+        else{
+            $wiki_file_path =
+                self::uncompressBz2File($compressed_wiki_file_path);
+        }
         $fr = fopen($wiki_file_path, 'r') or die("Can't open xml file");
-        $bigrams_file_path
+        $bigrams_file_path
             = WORK_DIRECTORY.self::FILTER_FOLDER.$lang_prefix.self::TEXT_SUFFIX;
         $fw = fopen($bigrams_file_path, 'w') or die("Can't open text file");
         $bigrams = array();
@@ -203,4 +245,37 @@ class Bigrams
         fclose($fw);
         return $num_bigrams;
     }
+
+    /**
+     * Uncompresses the bz2 compressed xml file given by
+     * $compressed_wiki_file_path. The file is read in blocks of
+     * $buffer_size uncompressed bytes and written to an xml file in
+     * the same directory as the compressed file. The name of the
+     * output file is obtained by removing a trailing ".bz2" from the
+     * compressed file name; only the extension at the very end of the
+     * path is removed, so a ".bz2" occurring in a folder name is left
+     * alone. If the input path does not end in ".bz2", or either file
+     * cannot be opened, or a read error occurs, the script dies rather
+     * than overwrite the input file or leave a silently truncated xml.
+     *
+     * @param string $compressed_wiki_file_path bz2 compressed
+     *     wikipedia XML file path.
+     * @return string $wiki_file_path Uncompressed xml file path.
+     */
+    static function uncompressBz2File($compressed_wiki_file_path)
+    {
+        $wiki_file_path =
+            preg_replace('/\.bz2$/', '', $compressed_wiki_file_path);
+        // without a .bz2 suffix the output would be written over the input
+        if($wiki_file_path === $compressed_wiki_file_path) {
+            die("Compressed file name must end in .bz2");
+        }
+        $bz = bzopen($compressed_wiki_file_path, 'r')
+            or die("Can't open bz2 file");
+        $out_file = fopen($wiki_file_path, 'w')
+            or die("Can't open xml file for writing");
+        // 8192 is the most uncompressed bytes bzread returns per call
+        $buffer_size = 8192;
+
+        // loop on feof rather than on the truthiness of the block read:
+        // a block consisting of just "0" is valid data but is falsy in PHP
+        while(!feof($bz)) {
+            $block = bzread($bz, $buffer_size);
+            if($block === false) {
+                die("Error reading bz2 file");
+            }
+            fwrite($out_file, $block);
+        }
+
+        fclose($out_file);
+        bzclose($bz);
+        return $wiki_file_path;
+    }
 }

ViewGit