Bigram support
Ravi [2012-01-24 09:Jan:th]
Bigram support
Signed-off-by: Chris Pollett <chris@pollett.org>
diff --git a/configs/bigram_builder.php b/configs/bigram_builder.php
index fbbe560ca..a349c4dba 100644
--- a/configs/bigram_builder.php
+++ b/configs/bigram_builder.php
@@ -22,6 +22,48 @@
*
* END LICENSE
*
+ * Bigrams are pairs of words which always occur together in the same
+ * sequence in a user query, e.g. "honda accord". Yioop treats such a
+ * pair of words as a single word to increase the speed and efficiency
+ * of retrieval. This script can be used to create a bigrams filter
+ * file for the yioop search engine to filter such words from documents
+ * and queries. The input to this script is an xml file which contains
+ * a large collection of such bigrams. The most common source of a large
+ * set of bigrams in an xml file is wikipedia dumps. Wikipedia dumps are
+ * available for download online free of cost. Typically the bigrams
+ * filter file is specific to a language, so the user has to create a
+ * separate filter file for each language to use this functionality. This
+ * script can be run multiple times to create a different filter file by
+ * specifying a different input xml and a different language as the
+ * command line arguments. The xml dumps of wikipedia specific to
+ * different languages are available to download. These language specific
+ * dumps of wikipedia as xml file serve as input to this script.
+ *
+ * For example the user can follow the following steps to create a bigrams
+ * filter for the English language.
+ *
+ * Step 1): Go to link http://dumps.wikimedia.org/enwiki/ which is source
+ * of dumps for english wikipedia. This page lists all the dumps according
+ * to date they were taken. Choose any suitable date or the latest. Say we
+ * chose 20120104/, dumps taken on 01/04/2012. This would take you to the
+ * page which has many links based on type of content you are looking for.
+ * We are interested in content titled
+ * "Recombine all pages, current versions only" with the link
+ * "enwiki-20120104-pages-meta-current.xml.bz2"
+ * This is a bz2 compressed xml file containing all the english pages of
+ * wikipedia. Download the file to the "search_filters" folder of your
+ * yioop work directory associated with your profile.
+ * (Note: You should have sufficient hard disk space in the order of
+ * 100GB to store the compressed dump and script extracted xml.
+ * The script also accepts an uncompressed XML file as input.
+ * The filter file generated is a few megabytes.)
+ *
+ * Step 2): Run this script from the php command line as below
+ * php bigram_builder.php enwiki-20120104-pages-meta-current.xml.bz2 en
+ *
+ * This would create a bigram filter for english in the same directory.
+ *
+ *
* @author Ravi Dhillon ravi.dhillon@yahoo.com
* @package seek_quarry
* @license http://www.gnu.org/licenses/ GPL3
@@ -36,12 +78,13 @@ ini_set("memory_limit","1024M");
if(count($argv) != 3){
echo "bigram_builder is used to create a bigram filter file for the \n".
- "Yioop! search engine. This filter file is used to detect when two \n";
+ "Yioop! search engine. This filter file is used to detect when two \n".
"words in a language should be treated as a unit. For example, \n".
"Bill Clinton. bigram_builder is run from the command line as:\n".
"php bigram.php wiki_xml lang\n".
- "where wiki_xml is a wikimedia xml file whose urls will be used to\n"
- "determine the bigrams and lang is an IANA language tag."
+ "where wiki_xml is a wikipedia xml file or a bz2 compressed xml\n".
+ "file whose urls will be used to determine the bigrams and lang\n".
+ "is an IANA language tag.";
exit();
}
diff --git a/lib/bigrams.php b/lib/bigrams.php
index 3d1b4315c..7a5716a18 100644
--- a/lib/bigrams.php
+++ b/lib/bigrams.php
@@ -62,10 +62,10 @@ require_once BASE_DIR."/lib/phrase_parser.php";
class Bigrams
{
- /**
- * Language tags and their corresponding bigram prefix
- * @var array
- */
+ /**
+ * Language tags and their corresponding bigram prefix
+ * @var array
+ */
static $LANG_PREFIX = array(
'en' => "en",
'en-US' => "en",
@@ -73,12 +73,31 @@ class Bigrams
'en-CA' => "en"
);
+ /**
+ * @constant Name of the folder inside user work directory
+ * that contains the input compressed XML file. The filter
+ * file generated will also be stored in this folder.
+ */
const FILTER_FOLDER = "/search_filters/";
+ /**
+ * @constant Suffix appended to language tag to create the
+ * filter file name containing bigrams.
+ */
const FILTER_SUFFIX = "_bigrams.ftr";
+ /**
+ * @constant Suffix appended to language tag to create the
+ * text file name containing bigrams.
+ */
const TEXT_SUFFIX = "_bigrams.txt";
/**
- * Extracts Bigrams from input set of phrases.
+ * Extracts Bigrams from input set of phrases. If a filter file
+ * is not available for $lang we just return the input phrases.
+ * Each pair of phrases is searched in filter file to check if
+ * it is a bigram. If a pair passes the bigram check we add it
+ * to the array of phrases as a single phrase otherwise individual
+ * phrases are added to the array. The resultant array of phrases
+ * is returned at the end.
*
* @param array $phrases subject to bigram check
* @param string $lang Language to be used to stem bigrams.
@@ -90,7 +109,7 @@ class Bigrams
if(isset(self::$LANG_PREFIX[$lang])) {
$lang_prefix = self::$LANG_PREFIX[$lang];
}
- $filter_path = WORK_DIRECTORY .
+ $filter_path = WORK_DIRECTORY .
self::FILTER_FOLDER . $lang_prefix . self::FILTER_SUFFIX;
if (file_exists($filter_path)) {
$bigrams = BloomFilterFile::load($filter_path);
@@ -124,7 +143,12 @@ class Bigrams
/**
- * Creates a bloom filter file from a bigram text file
+ * Creates a bloom filter file from a bigram text file. The
+ * path of bigram text file used is based on the input $lang.
+ * The name of the output filter file is based on $lang and its
+ * size is based on the input number of bigrams.
+ * The bigrams are read from text file, stemmed if a stemmer
+ * is available for $lang and then stored in filter file.
*
* @param string $lang language to be used to stem bigrams.
* @param number $num_bigrams Count of bigrams in text file.
@@ -136,8 +160,8 @@ class Bigrams
if(isset(self::$LANG_PREFIX[$lang])) {
$lang_prefix = self::$LANG_PREFIX[$lang];
}
- $filter_path =
- WORK_DIRECTORY . self::FILTER_FOLDER . $lang_prefix .
+ $filter_path =
+ WORK_DIRECTORY . self::FILTER_FOLDER . $lang_prefix .
self::FILTER_SUFFIX;
if (file_exists($filter_path)) {
$bigrams = BloomFilterFile::load($filter_path);
@@ -146,7 +170,7 @@ class Bigrams
$bigrams = new BloomFilterFile($filter_path, $num_bigrams);
}
- $inputFilePath = WORK_DIRECTORY . self::FILTER_FOLDER .
+ $inputFilePath = WORK_DIRECTORY . self::FILTER_FOLDER .
$lang_prefix . self::TEXT_SUFFIX;
$fp = fopen($inputFilePath, 'r') or die("Can't open bigrams text file");
while ( ($bigram = fgets($fp)) !== false) {
@@ -164,10 +188,19 @@ class Bigrams
}
/**
- * Generates a bigrams text file from input wikimedia xml file.
+ * Generates a bigrams text file from input wikipedia xml file.
+ * The input file can be bz2 compressed or uncompressed; if
+ * the file is compressed it is uncompressed by calling the function
+ * uncompressBz2File($compressed_wiki_file_path).
+ * The input XML file is parsed line by line and pattern for
+ * bigram is searched. If a bigram is found it is added to the
+ * array. After the complete file is parsed we remove the duplicate
+ * bigrams and sort them. The resulting array is written to the
+ * text file. The function returns the number of bigrams stored in
+ * the text file.
*
- * @param string $wiki_file wikimedia XML file name to be used to
- * extract bigrams.
+ * @param string $wiki_file compressed or uncompressed wikipedia
+ * XML file path to be used to extract bigrams.
* @param string $lang Language to be used to create bigrams.
* @return number $num_bigrams Count of bigrams in text file.
*/
@@ -177,9 +210,18 @@ class Bigrams
if(isset(self::$LANG_PREFIX[$lang])) {
$lang_prefix = self::$LANG_PREFIX[$lang];
}
- $wiki_file_path = WORK_DIRECTORY.self::FILTER_FOLDER.$wiki_file;
+ $compressed_wiki_file_path =
+ WORK_DIRECTORY.self::FILTER_FOLDER.$wiki_file;
+ $found = strpos($compressed_wiki_file_path, "bz2");
+ if($found == false){
+ $wiki_file_path = $compressed_wiki_file_path;
+ }
+ else{
+ $wiki_file_path =
+ self::uncompressBz2File($compressed_wiki_file_path);
+ }
$fr = fopen($wiki_file_path, 'r') or die("Can't open xml file");
- $bigrams_file_path
+ $bigrams_file_path
= WORK_DIRECTORY.self::FILTER_FOLDER.$lang_prefix.self::TEXT_SUFFIX;
$fw = fopen($bigrams_file_path, 'w') or die("Can't open text file");
$bigrams = array();
@@ -203,4 +245,37 @@ class Bigrams
fclose($fw);
return $num_bigrams;
}
+
+    /**
+     * Uncompresses the bz2 xml file $compressed_wiki_file_path into the
+     * same directory, reading $buffer_size bytes per iteration. The output
+     * name is the input name minus a trailing ".bz2"; if there is no such
+     * suffix, ".xml" is appended so the input is never overwritten. The
+     * script dies if either file cannot be opened.
+     *
+     * @param string $compressed_wiki_file_path bz2 compressed
+     *      wikipedia XML file path.
+     * @return string $wiki_file_path Uncompressed xml file path.
+     */
+    static function uncompressBz2File($compressed_wiki_file_path)
+    {
+        $wiki_file_path = preg_replace('/\.bz2$/', '',
+            $compressed_wiki_file_path);
+        if($wiki_file_path === $compressed_wiki_file_path) {
+            $wiki_file_path .= ".xml";
+        }
+        $bz = bzopen($compressed_wiki_file_path, 'r')
+            or die("Can't open bz2 file");
+        $out_file = fopen($wiki_file_path, 'w')
+            or die("Can't open xml file for writing");
+        $buffer_size = 8092;
+        // compare with "" so a block equal to "0" does not end the loop
+        while(($block = bzread($bz, $buffer_size)) !== false &&
+            $block !== "") {
+            fwrite($out_file, $block);
+        }
+        fclose($out_file);
+        bzclose($bz);
+        return $wiki_file_path;
+    }
}