diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index 894b01717..ea644437a 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -1,5 +1,5 @@
<?php
-/**
+/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
@@ -34,7 +34,7 @@
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-/**
+/**
*Loads base class for iterating
*/
require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
@@ -74,23 +74,36 @@ class IntersectIterator extends IndexBundleIterator
*/
var $to_advance_index;
+ /**
+ * Maps the index of each query word to the index of its iterator
+ * @var array
+ */
+ var $word_iterator_map; //Added by Ravi Dhillon
+
+ /**
+ * Number of elements in $this->word_iterator_map
+ * @var int
+ */
+ var $num_words; //Added by Ravi Dhillon
+
/**
* Creates an intersect iterator with the given parameters.
*
* @param object $index_bundle_iterator to use as a source of documents
* to iterate over
*/
- function __construct($index_bundle_iterators)
+ function __construct($index_bundle_iterators, $word_iterator_map) //Modified by Ravi Dhillon
{
$this->index_bundle_iterators = $index_bundle_iterators;
-
+ $this->word_iterator_map = $word_iterator_map; //Added by Ravi Dhillon
+ $this->num_words = count($word_iterator_map); //Added by Ravi Dhillon
$this->num_iterators = count($index_bundle_iterators);
$this->num_docs = 0;
$this->results_per_block = 1;
-
+
/*
We take an initial guess of the num_docs we returns as the sum
- of the num_docs of the underlying iterators. We are also setting
+ of the num_docs of the underlying iterators. We are also setting
up here that we return at most one posting at a time from each
iterator
*/
@@ -158,22 +171,29 @@ class IntersectIterator extends IndexBundleIterator
$len_lists = array();
$position_lists[0] = $docs[$key][self::POSITION_LIST];
$len_lists[0] = count($docs[$key][self::POSITION_LIST]);
- for($i = 1; $i < $this->num_iterators; $i++) {
- $i_docs =
- $this->index_bundle_iterators[$i]->currentDocsWithWord();
- if(isset($i_docs[$key][self::POSITION_LIST]) &&
- ($ct = count($i_docs[$key][self::POSITION_LIST]) > 0 )) {
- $position_lists[] = $i_docs[$key][self::POSITION_LIST];
- $len_lists[] = $ct;
+ for($i = 1; $i < $this->num_words; $i++) { //Modified by Ravi Dhillon
+ if($this->word_iterator_map[$i]<$i) { //Added by Ravi Dhillon
+ $position_lists[] = $position_lists[$this->word_iterator_map[$i]];
+ $docs[$key][self::RELEVANCE] +=
+ $docs[$key][self::RELEVANCE];
}
+ else { //Added by Ravi Dhillon
+ $i_docs =
+ $this->index_bundle_iterators[$this->word_iterator_map[$i]]->currentDocsWithWord(); //Modified by Ravi Dhillon
+ if(isset($i_docs[$key][self::POSITION_LIST]) &&
+ ($ct = count($i_docs[$key][self::POSITION_LIST]) > 0 )) {
+ $position_lists[] = $i_docs[$key][self::POSITION_LIST];
+ $len_lists[] = $ct;
+ }
- if(isset($i_docs[$key])) {
- $docs[$key][self::RELEVANCE] +=
- $i_docs[$key][self::RELEVANCE];
- }
+ if(isset($i_docs[$key])) {
+ $docs[$key][self::RELEVANCE] +=
+ $i_docs[$key][self::RELEVANCE];
+ }
+ }
}
if(count($position_lists) > 1) {
- $docs[$key][self::PROXIMITY] =
+ $docs[$key][self::PROXIMITY] =
$this->computeProximity($position_lists, $len_lists,
$docs[$key][self::IS_DOC]);
} else {
@@ -182,7 +202,7 @@ class IntersectIterator extends IndexBundleIterator
$docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] *
$docs[$key][self::RELEVANCE] * $docs[$key][self::PROXIMITY];
}
- $this->count_block = count($docs);
+ $this->count_block = count($docs);
$this->pages = $docs;
return $docs;
}
@@ -192,12 +212,13 @@ class IntersectIterator extends IndexBundleIterator
* a score for how close those words were in the given document
*
* @param array $position_lists a 2D array item number => position_list
- * (locations in doc where item occurred) for that item.
+ * (locations in doc where item occurred) for that item.
* @param array $len_lists length for each item of its position list
* @param bool $is_doc whether this is the position list of a document
* or a link
* @return sum of smallest abs of position differences between terms
*/
+ /*
function computeProximity(&$word_position_lists, &$word_len_lists, $is_doc)
{
$num_iterators = $this->num_iterators;
@@ -227,7 +248,7 @@ class IntersectIterator extends IndexBundleIterator
$positions = array($o_position);
for($i = 1; $i < $num; $i++) {
$positions[$i] = $position_lists[$i][$counters[$i]];
- if($positions[$i] < $o_position &&
+ if($positions[$i] < $o_position &&
$counters[$i] < $len_lists[$i] - 1) {
$min_counter = $i;
}
@@ -247,6 +268,100 @@ class IntersectIterator extends IndexBundleIterator
return $weight*($num - 1)/$min_diff;
}
+ */
+
+    /**
+     * Scores how close together a collection of terms occur in a document
+     * or in link text by a plane sweep over the terms' position lists.
+     * Every cover found (a window containing one position of each term)
+     * adds weight / (width of the cover) to the score.
+     *
+     * @param array &$word_position_lists a 2D array item number =>
+     *      position_list (locations in doc where item occurred, assumed
+     *      to be ascending). Only a copy is consumed by the sweep
+     * @param array &$word_len_lists length for each item of its position
+     *      list; kept for interface compatibility, it is not read here
+     * @param bool $is_doc whether this is the position list of a document
+     *      or a link
+     * @return number 1 if there are no iterators, 0 if some term is left
+     *      without a position, otherwise the weighted sum over all covers
+     */
+    function computeProximity(&$word_position_lists, &$word_len_lists, $is_doc)
+    {
+        if($this->num_iterators < 1) return 1;
+
+        $covers = array();
+        // assignment copies the array, so the shifts below do not alter
+        // the caller's lists
+        $position_list = $word_position_lists;
+        $interval = array();
+        $num_words = count($position_list);
+        // seed the window with the first still unused position of each term
+        for($i = 0; $i < $num_words; $i++) {
+            $min = array_shift($position_list[$i]);
+            if(isset($min)) {
+                array_push($interval, array($min, $i));
+                // a repeated query term reuses one position list, so drop
+                // this position from any list that would offer it next
+                for($j = 0; $j < $num_words; $j++) {
+                    if(isset($position_list[$j][0]) &&
+                        $min == $position_list[$j][0]) {
+                        array_shift($position_list[$j]);
+                    }
+                }
+            }
+        }
+        // some term ran out of positions: no window can hold every term
+        if(count($interval) != $num_words) {
+            return 0;
+        }
+        sort($interval); // entries are (position, term), ordered by position
+        $l = array_shift($interval);
+        $r = end($interval);
+        // slide the left edge until its term has no further positions
+        while(sizeof($position_list[$l[1]]) != 0) {
+            $p = array_shift($position_list[$l[1]]);
+            for($i = 0; $i < $num_words; $i++) {
+                if(isset($position_list[$i][0]) &&
+                    $p == $position_list[$i][0]) {
+                    array_shift($position_list[$i]);
+                }
+            }
+            // put the left term back at position $p, keeping $interval ordered
+            $q = $interval[0][0];
+            if($p > $r[0]) {
+                // next position of the left term lies beyond the right edge:
+                array_push($covers, array($l[0], $r[0]));
+                array_push($interval, array($p, $l[1]));
+            } else if($p < $q) {
+                array_unshift($interval, array($p, $l[1]));
+            } else {
+                array_push($interval, array($p, $l[1]));
+                sort($interval);
+            }
+            $l = array_shift($interval);
+            $r = end($interval);
+        }
+        // whatever window remains at the end is recorded as the last cover
+        array_push($covers, array($l[0], $r[0]));
+        $score = 0;
+        $weight = ($is_doc) ? TITLE_WEIGHT : LINK_WEIGHT;
+        foreach($covers as $cover) {
+            /*
+                Leading covers that end before AD_HOC_TITLE_LENGTH get
+                TITLE_WEIGHT. The first cover that does not, and every cover
+                after it, gets DESCRIPTION_WEIGHT. (Before, that first cover
+                was shifted off $covers by the title loop and never scored,
+                so a document matching only in its description scored 0.)
+            */
+            if($is_doc && !(isset($cover[1]) &&
+                $cover[1] < AD_HOC_TITLE_LENGTH)) {
+                $weight = DESCRIPTION_WEIGHT;
+            }
+            $score += ($weight/($cover[1] - $cover[0] + 1));
+        }
+        return $score;
+    }
/**
* Finds the next generation and doc offset amongst all the iterators
@@ -260,16 +375,16 @@ class IntersectIterator extends IndexBundleIterator
return -1;
}
$gen_doc_offset[0] = $biggest_gen_offset;
- $all_same = true;
+ $all_same = true;
for($i = 1; $i < $this->num_iterators; $i++) {
- $cur_gen_doc_offset =
+ $cur_gen_doc_offset =
$this->index_bundle_iterators[
$i]->currentGenDocOffsetWithWord();
$gen_doc_offset[$i] = $cur_gen_doc_offset;
if($cur_gen_doc_offset == -1) {
return -1;
}
- $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset,
+ $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset,
$biggest_gen_offset);
if($gen_doc_cmp > 0) {
$biggest_gen_offset = $cur_gen_doc_offset;
@@ -285,17 +400,17 @@ class IntersectIterator extends IndexBundleIterator
$i = 0;
while($i != $last_changed) {
if($last_changed == -1) $last_changed = 0;
- if($this->genDocOffsetCmp($gen_doc_offset[$i],
- $biggest_gen_offset) < 0) {
+ if($this->genDocOffsetCmp($gen_doc_offset[$i],
+ $biggest_gen_offset) < 0) {
$iterator = $this->index_bundle_iterators[$i];
$iterator->advance($biggest_gen_offset);
- $cur_gen_doc_offset =
+ $cur_gen_doc_offset =
$iterator->currentGenDocOffsetWithWord();
$gen_doc_offset[$i] = $cur_gen_doc_offset;
if($cur_gen_doc_offset == -1) {
return -1;
}
- if($this->genDocOffsetCmp($cur_gen_doc_offset,
+ if($this->genDocOffsetCmp($cur_gen_doc_offset,
$biggest_gen_offset) > 0) {
$last_changed = $i;
$biggest_gen_offset = $cur_gen_doc_offset;
@@ -314,10 +429,10 @@ class IntersectIterator extends IndexBundleIterator
* Forwards the iterator one group of docs
* @param array $gen_doc_offset a generation, doc_offset pair. If set,
* the must be of greater than or equal generation, and if equal the
- * next block must all have $doc_offsets larger than or equal to
+ * next block must all have $doc_offsets larger than or equal to
* this value
*/
- function advance($gen_doc_offset = null)
+ function advance($gen_doc_offset = null)
{
$this->advanceSeenDocs();
@@ -326,24 +441,24 @@ class IntersectIterator extends IndexBundleIterator
//num_docs can change when advance() called so that's why we recompute
$total_num_docs = 0;
for($i = 0; $i < $this->num_iterators; $i++) {
- $this->seen_docs_unfiltered +=
+ $this->seen_docs_unfiltered +=
$this->index_bundle_iterators[$i]->seen_docs;
$total_num_docs += $this->index_bundle_iterators[$i]->num_docs;
}
if($this->seen_docs_unfiltered > 0) {
- $this->num_docs =
+ $this->num_docs =
floor(($this->seen_docs * $total_num_docs) /
$this->seen_docs_unfiltered);
- }
+ }
$this->index_bundle_iterators[0]->advance($gen_doc_offset);
}
/**
- * Gets the doc_offset and generation for the next document that
+ * Gets the doc_offset and generation for the next document that
* would be return by this iterator
*
- * @return mixed an array with the desired document offset
+ * @return mixed an array with the desired document offset
* and generation; -1 on fail
*/
function currentGenDocOffsetWithWord() {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 47b91adae..00482cb0a 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -1,5 +1,5 @@
<?php
-/**
+/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
@@ -36,27 +36,27 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
* Load the stem word functions, if necessary
*/
-foreach(glob(BASE_DIR."/lib/stemmers/*_stemmer.php")
- as $filename) {
+foreach(glob(BASE_DIR."/lib/stemmers/*_stemmer.php")
+ as $filename) {
require_once $filename;
}
-
+
/**
* Reads in constants used as enums used for storing web sites
*/
require_once BASE_DIR."/lib/crawl_constants.php";
/**
- * Library of functions used to manipulate words and phrases
+ * Library of functions used to manipulate words and phrases
*
* @author Chris Pollett
*
* @package seek_quarry
* @subpackage library
*/
-class PhraseParser
+class PhraseParser
{
- /**
+ /**
* Language tags and their corresponding stemmer
* @var array
*/
@@ -97,9 +97,9 @@ class PhraseParser
*/
static function extractWordStringPageSummary($page)
{
- $title_phrase_string = mb_ereg_replace(PUNCT, " ",
+ $title_phrase_string = mb_ereg_replace(PUNCT, " ",
$page[CrawlConstants::TITLE]);
- $description_phrase_string = mb_ereg_replace(PUNCT, " ",
+ $description_phrase_string = mb_ereg_replace(PUNCT, " ",
$page[CrawlConstants::DESCRIPTION]);
$page_string = $title_phrase_string . " " . $description_phrase_string;
@@ -107,9 +107,32 @@ class PhraseParser
return $page_string;
}
-
+
+ /**
+ * Extracts all phrases (sequences of adjacent words) from $string of
+ * length less than $len. No de-duplication or counting is done here.
+ *
+ * @param string $string subject to extract phrases from
+ * @param int $len exclusive upper bound on the phrase lengths to consider
+ * @param string $lang locale tag for stemming
+ * @return array of phrases
+ */
+ static function extractPhrases($string,
+ $len = MAX_PHRASE_LEN, $lang = NULL)
+ {
+ $phrases = array();
+ // the phrase lengths passed on below are 0 .. $len - 1
+ for($i = 0; $i < $len; $i++) {
+ $phrases =
+ array_merge($phrases,
+ self::extractPhrasesOfLength($string, $i, $lang));
+ }
+
+ return $phrases;
+ }
+
/**
- * Extracts all phrases (sequences of adjacent words) from $string of
+ * Extracts all phrases (sequences of adjacent words) from $string of
* length less than or equal to $len.
*
* @param string $string subject to extract phrases from
@@ -117,13 +140,13 @@ class PhraseParser
* @param string $lang locale tag for stemming
* @return array pairs of the form (phrase, number of occurrences)
*/
- static function extractPhrasesAndCount($string,
- $len = MAX_PHRASE_LEN, $lang = NULL)
+ static function extractPhrasesAndCount($string,
+ $len = MAX_PHRASE_LEN, $lang = NULL)
{
$phrases = array();
for($i = 0; $i < $len; $i++) {
- $phrases =
+ $phrases =
array_merge($phrases,
self::extractPhrasesOfLength($string, $i, $lang));
}
@@ -134,7 +157,7 @@ class PhraseParser
}
/**
- * Extracts all phrases (sequences of adjacent words) from $string of
+ * Extracts all phrases (sequences of adjacent words) from $string of
* length less than or equal to $len.
*
* @param string $string subject to extract phrases from
@@ -143,7 +166,7 @@ class PhraseParser
* @return array word => list of positions at which the word occurred in
* the document
*/
- static function extractPhrasesInLists($string,
+ static function extractPhrasesInLists($string,
$len = MAX_PHRASE_LEN, $lang = NULL)
{
$phrase_lists = array();
@@ -159,7 +182,7 @@ class PhraseParser
}
/**
- * Extracts all phrases (sequences of adjacent words) from $string of
+ * Extracts all phrases (sequences of adjacent words) from $string of
* length exactly equal to $len.
*
* @param string $string subject to extract phrases from
@@ -167,13 +190,13 @@ class PhraseParser
* @param string $lang locale tag for stemming
* @return array of phrases
*/
- static function extractPhrasesOfLength($string, $phrase_len, $lang = NULL)
+ static function extractPhrasesOfLength($string, $phrase_len, $lang = NULL)
{
$phrases = array();
-
+
for($i = 0; $i < $phrase_len; $i++) {
- $phrases = array_merge($phrases,
- self::extractPhrasesOfLengthOffset($string,
+ $phrases = array_merge($phrases,
+ self::extractPhrasesOfLengthOffset($string,
$phrase_len, $i, $lang));
}
@@ -181,7 +204,7 @@ class PhraseParser
}
/**
- * Extracts phrases (sequences of adjacent words) from $string of
+ * Extracts phrases (sequences of adjacent words) from $string of
* length exactly equal to $len, beginning with the $offset'th word.
* This extracts the the $len many words after offset, then the $len
* many words after that, and so on.
@@ -192,8 +215,8 @@ class PhraseParser
* @param string $lang locale tag for stemming
* @return array of phrases
*/
- static function extractPhrasesOfLengthOffset($string,
- $phrase_len, $offset, $lang = NULL)
+ static function extractPhrasesOfLengthOffset($string,
+ $phrase_len, $offset, $lang = NULL)
{
$words = mb_split("[[:space:]]|".PUNCT, $string);
@@ -208,7 +231,7 @@ class PhraseParser
if($words[$i] == "") {continue;}
$phrase_number = ($i - $offset)/$phrase_len;
- if(!isset($stems[$phrase_number])) {
+ if(!isset($stems[$phrase_number])) {
$stems[$phrase_number]="";
$first_time = "";
}
@@ -217,7 +240,7 @@ class PhraseParser
if($stemmer != NULL) {
$stem_obj = new $stemmer(); //for php 5.2 compatibility
- $stem = $stem_obj->stem($pre_stem);
+ $stem = $stem_obj->stem($pre_stem);
} else {
$stem = $pre_stem;
}
@@ -244,7 +267,7 @@ class PhraseParser
* Returns the characters n-grams for the given terms where n is the length
* Yioop uses for the language in question. If a stemmer is used for
* language then n-gramming is no done and this just returns an empty array
- *
+ *
* @param array $term the terms to make n-grams for
* @param string $lang locale tag to determine n to be used for n-gramming
*
@@ -257,7 +280,7 @@ class PhraseParser
} else {
return array();
}
-
+
$ngrams = array();
foreach($terms as $term) {
diff --git a/models/phrase_model.php b/models/phrase_model.php
index a8f43e047..979cccc68 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -1,5 +1,5 @@
<?php
-/**
+/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
@@ -33,20 +33,20 @@
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-/**
- * logging is done during crawl not through web,
- * so it will not be used in the phrase model
+/**
+ * logging is done during crawl not through web,
+ * so it will not be used in the phrase model
*/
if(!defined("POST_PROCESSING")) {
define("LOG_TO_FILES", false);
-}
+}
/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/** For extractPhrasesAndCount function */
-require_once BASE_DIR."/lib/phrase_parser.php";
-
-/**
- * Used to look up words and phrases in the inverted index
+require_once BASE_DIR."/lib/phrase_parser.php";
+
+/**
+ * Used to look up words and phrases in the inverted index
* associated with a given crawl
*/
require_once BASE_DIR."/lib/index_archive_bundle.php";
@@ -59,13 +59,13 @@ require_once(BASE_DIR."/lib/file_cache.php");
/**
* Load iterators to get docs out of index archive
*/
-foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php")
- as $filename) {
+foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php")
+ as $filename) {
require_once $filename;
}
/**
- *
+ *
* This is class is used to handle
* results for a given phrase search
*
@@ -73,7 +73,7 @@ foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php")
* @package seek_quarry
* @subpackage model
*/
-class PhraseModel extends Model
+class PhraseModel extends Model
{
/** used to hold the name of index archive to look summaries up in
@@ -81,7 +81,7 @@ class PhraseModel extends Model
*/
var $index_name;
- /** an associative array of additional meta words and
+ /** an associative array of additional meta words and
* the max description length of results if such a meta word is used
* this array is typically set in index.php
*
@@ -103,7 +103,7 @@ class PhraseModel extends Model
/**
* {@inheritdoc}
*/
- function __construct($db_name = DB_NAME)
+ function __construct($db_name = DB_NAME)
{
parent::__construct($db_name);
}
@@ -155,14 +155,14 @@ class PhraseModel extends Model
preg_match_all($pattern, $query, $matches);
if(isset($matches[2][0])) {
$base_weight = substr($matches[2][0],strlen("weight:"));
- $disjunct_string =
+ $disjunct_string =
preg_replace($pattern,"", $disjunct_string);
}
$pattern = "/(\s)(w:(\S)+)/";
preg_match_all($pattern, $query, $matches);
if(isset($matches[2][0])) {
$base_weight = substr($matches[2][0],strlen("w:"));
- $disjunct_string =
+ $disjunct_string =
preg_replace($pattern,"", $disjunct_string);
}
$pipe2 = "";
@@ -182,7 +182,7 @@ class PhraseModel extends Model
}
$num_results = (isset($group['RESULT_BOUND']) &&
- $group['RESULT_BOUND'] > 1) ?
+ $group['RESULT_BOUND'] > 1) ?
$group['RESULT_BOUND'] : 1;
$rewrite .= " #$num_results# ";
}
@@ -191,13 +191,13 @@ class PhraseModel extends Model
}
/**
- * Given a query phrase, returns formatted document summaries of the
+ * Given a query phrase, returns formatted document summaries of the
* documents that match the phrase.
*
* @param string $phrase the phrase to try to match
* @param int $low return results beginning with the $low document
* @param int $results_per_page how many results to return
- * @param bool $format whether to highlight in the returned summaries the
+ * @param bool $format whether to highlight in the returned summaries the
* matched text
* @param array $filter an array of hashes of domains to filter from
* results
@@ -206,31 +206,31 @@ class PhraseModel extends Model
* the file cache or memcache. Otherwise, items will be recomputed
* and then potentially restored in cache
* @param int $raw ($raw == 0) normal grouping, ($raw == 1)
- * no grouping but page look-up for links, ($raw == 2)
+ * no grouping but page look-up for links, ($raw == 2)
* no grouping done on data
*
* @return array an array of summary data
*/
function getPhrasePageResults(
- $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE,
+ $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE,
$format = true, $filter = NULL, $use_cache_if_allowed = true,
$raw = 0)
{
if(QUERY_STATISTICS) {
$indent= " ";
- $in2 = $indent . $indent;
+ $in2 = $indent . $indent;
$in3 = $in2 . $indent;
$prs_cnt = 0;
$dis_cnt = 0;
$this->query_info = array();
- $this->query_info['QUERY'] =
+ $this->query_info['QUERY'] =
"<b>PHRASE QUERY</b>: ".$input_phrase."<br />";
$start_time = microtime();
}
$results = NULL;
$word_structs = array();
- /*
+ /*
this is a quick and dirty parsing and will usually work,
exceptions would be # or | in quotes or if someone tried
to escape |.
@@ -238,7 +238,7 @@ class PhraseModel extends Model
First we split into presentation elements then we split by
disjuncts
*/
- $presentation_parts = preg_split('/#(\d)+#/',
+ $presentation_parts = preg_split('/#(\d)+#/',
$input_phrase, -1, PREG_SPLIT_DELIM_CAPTURE);
$count = 0;
@@ -251,9 +251,9 @@ class PhraseModel extends Model
for($i = 0; $i < $num_parts ; $i++) {
if(isset($presentation_parts[$i][0]) &&
($trimmed = trim($presentation_parts[$i][0])) != "" ) {
- $to_return = (isset($presentation_parts[$i][1])) ?
+ $to_return = (isset($presentation_parts[$i][1])) ?
$presentation_parts[$i][1]: 1;
- $query_parts[$trimmed][] =
+ $query_parts[$trimmed][] =
array($count, $to_return);
$last_part = $trimmed;
if(isset($presentation_parts[$i][1])) {
@@ -266,7 +266,7 @@ class PhraseModel extends Model
$results_high = $low + $results_per_page;
$num_last_parts = count($query_parts[$last_part]);
- if($query_parts[$last_part][$num_last_parts - 1][0] +
+ if($query_parts[$last_part][$num_last_parts - 1][0] +
$query_parts[$last_part][$num_last_parts - 1][1] < $low) {
$query_parts[$last_part][$num_last_parts - 1][1] = $results_high;
}
@@ -292,22 +292,22 @@ class PhraseModel extends Model
}
if($num_bounds == 0) continue;
if($phrase == $last_part &&
- $result_bounds[$num_bounds - 1][0] +
+ $result_bounds[$num_bounds - 1][0] +
$result_bounds[$num_bounds - 1][1] < $results_high) {
- $result_bounds[$num_bounds - 1][1] = $results_high -
+ $result_bounds[$num_bounds - 1][1] = $results_high -
$result_bounds[$num_bounds - 1][0];
}
- $phrase_num = max(min($phrase_high, $results_high), $results_high) -
+ $phrase_num = max(min($phrase_high, $results_high), $results_high) -
$low;
$disjunct_phrases = explode("|", $phrase);
$word_structs = array();
if(QUERY_STATISTICS) {
- $this->query_info['QUERY'] .= $indent .
+ $this->query_info['QUERY'] .= $indent .
"<b>Presentation $prs_cnt:</b><br />";
$this->query_info['QUERY'] .= "$in2<i>Low</i>:".
$result_bounds[0][0]."<br />";
- $this->query_info['QUERY'] .= $in2 .
+ $this->query_info['QUERY'] .= $in2 .
"<i>High</i>: ".$result_bounds[0][1]."<br />";
$prs_cnt++;
}
@@ -319,31 +319,31 @@ class PhraseModel extends Model
. "</b><br />";
$dis_cnt++;
}
- list($word_struct, $format_words) =
+ list($word_struct, $format_words) =
$this->parseWordStructConjunctiveQuery($disjunct);
if($word_struct != NULL) {
$word_structs[] = $word_struct;
}
}
- if(QUERY_STATISTICS) {
- $this->query_info['QUERY'] .=
+ if(QUERY_STATISTICS) {
+ $this->query_info['QUERY'] .=
"$in2<b>Presentation Parse time</b>: " .
changeInMicrotime($start_time)."<br />";
$summaries_time = microtime();
}
- $out_results = $this->getSummariesByHash($word_structs,
+ $out_results = $this->getSummariesByHash($word_structs,
$low, $phrase_num, $filter, $use_cache_if_allowed, $raw);
- if(isset($out_results['PAGES']) &&
+ if(isset($out_results['PAGES']) &&
count($out_results['PAGES']) != 0) {
$out_count = 0;
foreach($result_bounds as $bound) {
- for($i = $bound[0];
+ for($i = $bound[0];
$i < min($bound[0] + $bound[1], $results_high);
$i++) {
if(isset($out_results['PAGES'][$out_count])) {
- $results['PAGES'][$i] =
+ $results['PAGES'][$i] =
$out_results['PAGES'][$out_count];
$out_count++;
}
@@ -353,7 +353,7 @@ class PhraseModel extends Model
$total_rows = $out_results['TOTAL_ROWS'];
}
}
- if(QUERY_STATISTICS) {
+ if(QUERY_STATISTICS) {
$this->query_info['QUERY'] .= "$in2<b>Get Summaries time</b>: ".
changeInMicrotime($summaries_time)."<br />";
$format_time = microtime();
@@ -376,7 +376,7 @@ class PhraseModel extends Model
} else {
$results['TOTAL_ROWS'] = count($results['PAGES']);
}
-
+
if($format) {
if(count($format_words) == 0 ){
$format_words = NULL;
@@ -386,7 +386,7 @@ class PhraseModel extends Model
}
$description_length = self::DEFAULT_DESCRIPTION_LENGTH;
- if(isset($this->additional_meta_words) &&
+ if(isset($this->additional_meta_words) &&
is_array($this->additional_meta_words)) {
foreach($this->additional_meta_words as $meta_word => $length){
$pattern = "/$meta_word/";
@@ -396,10 +396,10 @@ class PhraseModel extends Model
}
}
}
- $output = $this->formatPageResults($results, $format_words,
+ $output = $this->formatPageResults($results, $format_words,
$description_length);
- if(QUERY_STATISTICS) {
+ if(QUERY_STATISTICS) {
$this->query_info['QUERY'] .= "<b>Format time</b>: ".
changeInMicrotime($format_time)."<br />";
$this->query_info['ELAPSED_TIME'] = changeInMicrotime($start_time);
@@ -412,7 +412,7 @@ class PhraseModel extends Model
/**
* Determines the offset into the summaries WebArchiveBundle of the
- * provided url so that the info:url summary can be retrieved.
+ * provided url so that the info:url summary can be retrieved.
* This assumes of course that the info:url meta word has been stored.
*
* @param string $url what to lookup
@@ -427,11 +427,11 @@ class PhraseModel extends Model
$pages = array();
$summary_offset = NULL;
$num_generations = $index_archive->generation_info['ACTIVE'];
- $word_iterator =
+ $word_iterator =
new WordIterator(crawlHash("info:$url"), $index_archive);
if(is_array($next_docs = $word_iterator->nextDocsWithWord())) {
foreach($next_docs as $doc_key => $doc_info) {
- $summary_offset =
+ $summary_offset =
$doc_info[CrawlConstants::SUMMARY_OFFSET];
$generation = $doc_info[CrawlConstants::GENERATION];
$cache_partition = $doc_info[CrawlConstants::SUMMARY][
@@ -443,7 +443,7 @@ class PhraseModel extends Model
}
if($num_retrieved == 0) {
return false;
- }
+ }
} else {
return false;
}
@@ -483,14 +483,14 @@ class PhraseModel extends Model
foreach($meta_words as $meta_word) {
$pattern = "/(\s)($meta_word(\S)+)/";
preg_match_all($pattern, $phrase, $matches);
- if(!in_array($meta_word, array('i:', 'index:', 'w:',
+ if(!in_array($meta_word, array('i:', 'index:', 'w:',
'weight:', '\-') )) {
$matches = $matches[2];
$found_metas = array_merge($found_metas, $matches);
} else if($meta_word == '\-') {
if(count($matches[0]) > 0) {
- $disallow_phrases =
- array_merge($disallow_phrases,
+ $disallow_phrases =
+ array_merge($disallow_phrases,
array(substr($matches[2][0],1)));
}
} else if ($meta_word == 'i:' || $meta_word == 'index:') {
@@ -512,21 +512,25 @@ class PhraseModel extends Model
$phrase_string = mb_ereg_replace(PUNCT, " ", $phrase_string);
$phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
/*
- we search using the stemmed/char-grammed words, but we format
+ we search using the stemmed/char-grammed words, but we format
snippets in the results by bolding either
*/
$query_words = explode(" ", $phrase_string); //not stemmed
- $base_words =
- array_keys(PhraseParser::extractPhrasesAndCount($phrase_string,
- MAX_PHRASE_LEN, getLocaleTag())); //stemmed, if have stemmer
+ /*$base_words = //Commented by Ravi Dhillon
+ array_keys(PhraseParser::extractPhrasesAndCount($phrase_string,
+ MAX_PHRASE_LEN, getLocaleTag())); //stemmed, if have stemmer
+ */
+ $base_words = //Added by Ravi Dhillon
+ PhraseParser::extractPhrases($phrase_string,MAX_PHRASE_LEN,
+ getLocaleTag()); //stemmed, if have stemmer
$words = array_merge($base_words, $found_metas);
if(QUERY_STATISTICS) {
$this->query_info['QUERY'] .= "$in3<i>Index</i>: ".
$index_archive_name."<br />";
$this->query_info['QUERY'] .= "$in3<i>LocaleTag</i>: ".
getLocaleTag()."<br />";
- $this->query_info['QUERY'] .=
+ $this->query_info['QUERY'] .=
"$in3<i>Stemmed/Char-grammed Words</i>:<br />";
foreach($base_words as $word){
$this->query_info['QUERY'] .= "$in4$word<br />";
@@ -536,7 +540,7 @@ class PhraseModel extends Model
$this->query_info['QUERY'] .= "$in4$word<br />";
}
}
- if(isset($words) && count($words) == 1 &&
+ if(isset($words) && count($words) == 1 &&
count($disallow_phrases) < 1) {
$phrase_string = $words[0];
$phrase_hash = crawlHash($phrase_string);
@@ -545,19 +549,19 @@ class PhraseModel extends Model
"WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive
);
} else {
- /*
- handle strings in quotes
+ /*
+ handle strings in quotes
(we want an exact match on such quoted strings)
*/
$quoteds =array();
$hash_quoteds = array();
- $num_quotes =
+ $num_quotes =
preg_match_all('/\"((?:[^\"\\\]|\\\\.)*)\"/', $phrase,$quoteds);
if(isset($quoteds[1])) {
$quoteds = $quoteds[1];
}
- //get a raw list of words and their hashes
+ //get a raw list of words and their hashes
$hashes = array();
$i = 0;
@@ -567,7 +571,7 @@ class PhraseModel extends Model
$restrict_phrases = $quoteds;
- $hashes = array_unique($hashes);
+ //$hashes = array_unique($hashes); //Commented by Ravi Dhillon
if(count($hashes) > 0) {
$word_keys = array_slice($hashes, 0, MAX_QUERY_TERMS);
} else {
@@ -582,14 +586,14 @@ class PhraseModel extends Model
$num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
for($i = 0; $i < $num_disallow_keys; $i++) {
$disallow_stem=array_keys(PhraseParser::extractPhrasesAndCount(
- $disallow_phrases[$i], 2, getLocaleTag()));
+ $disallow_phrases[$i], 2, getLocaleTag()));
//stemmed
$disallow_keys[] = crawlHash($disallow_stem[0]);
}
if($word_keys !== NULL) {
$word_struct = array("KEYS" => $word_keys,
- "RESTRICT_PHRASES" => $restrict_phrases,
+ "RESTRICT_PHRASES" => $restrict_phrases,
"DISALLOW_KEYS" => $disallow_keys,
"WEIGHT" => $weight,
"INDEX_ARCHIVE" => $index_archive
@@ -630,36 +634,36 @@ class PhraseModel extends Model
/**
* Given a page summary extract the words from it and try to find documents
- * which match the most relevant words. The algorithm for "relevant" is
- * pretty weak. For now we pick the $num many words which appear in the
+ * which match the most relevant words. The algorithm for "relevant" is
+ * pretty weak. For now we pick the $num many words which appear in the
* fewest documents.
*
* @param string $crawl_item a page summary
* @param int $num number of key phrase to return
* @return array an array of most selective key phrases
*/
- function getTopPhrases($crawl_item, $num)
+ function getTopPhrases($crawl_item, $num)
{
$index_archive_name = self::index_data_base_name . $this->index_name;
- $index_archive =
+ $index_archive =
new IndexArchiveBundle(CRAWL_DIR.'/cache/'.$index_archive_name);
- $phrase_string =
+ $phrase_string =
PhraseParser::extractWordStringPageSummary($crawl_item);
- $words =
+ $words =
array_keys(PhraseParser::extractPhrasesAndCount($phrase_string));
$hashes = array();
$lookup = array();
foreach($words as $word) {
- $tmp = crawlHash($word);
+ $tmp = crawlHash($word);
$hashes[] = $tmp;
$lookup[$tmp] = $word;
}
- $words_array =
+ $words_array =
$index_archive->getSelectiveWords($hashes, $num, "greaterThan");
$word_keys = array_keys($words_array);
$phrases = array();
@@ -691,7 +695,7 @@ class PhraseModel extends Model
* the file cache or memcache. Otherwise, items will be recomputed
* and then potentially restored in cache
* @param int $raw ($raw == 0) normal grouping, ($raw == 1)
- * no grouping but page look-up for links, ($raw == 2)
+ * no grouping but page look-up for links, ($raw == 2)
* no grouping done on data
*
* @return array document summaries
@@ -703,7 +707,7 @@ class PhraseModel extends Model
$pages = array();
$generation = 0;
- $to_retrieve = ceil(($limit+$num)/self::NUM_CACHE_PAGES) *
+ $to_retrieve = ceil(($limit+$num)/self::NUM_CACHE_PAGES) *
self::NUM_CACHE_PAGES;
$start_slice = floor(($limit)/self::NUM_CACHE_PAGES) *
self::NUM_CACHE_PAGES;
@@ -727,13 +731,13 @@ class PhraseModel extends Model
$cache_success = false;
break;
}
- $results['PAGES'] = array_merge($results['PAGES'],
+ $results['PAGES'] = array_merge($results['PAGES'],
$slice['PAGES']);
$results['TOTAL_ROWS'] = $slice['TOTAL_ROWS'];
}
if($cache_success) {
- $results['PAGES'] =
- array_slice($results['PAGES'],
+ $results['PAGES'] =
+ array_slice($results['PAGES'],
$limit - $start_slice, $num);
return $results;
}
@@ -744,7 +748,7 @@ class PhraseModel extends Model
$num_retrieved = 0;
$pages = array();
- while(is_object($query_iterator) &&
+ while(is_object($query_iterator) &&
is_array($next_docs = $query_iterator->nextDocsWithWord()) &&
$num_retrieved < $to_retrieve) {
foreach($next_docs as $doc_key => $doc_info) {
@@ -766,7 +770,7 @@ class PhraseModel extends Model
$results['TOTAL_ROWS'] = $num_retrieved;
} else {
$results['TOTAL_ROWS'] = $query_iterator->num_docs;
- //this is only an approximation
+ //this is only an approximation
}
$result_count = count($pages);
@@ -776,7 +780,7 @@ class PhraseModel extends Model
}
for($i = 0;$i < $to_retrieve;$i+=self::NUM_CACHE_PAGES){
$summary_hash = crawlHash($mem_tmp.":".$i);
- $slice['PAGES'] = array_slice($pages, $i,
+ $slice['PAGES'] = array_slice($pages, $i,
self::NUM_CACHE_PAGES);
$slice['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
$CACHE->set($summary_hash, $slice);
@@ -785,7 +789,7 @@ class PhraseModel extends Model
}
$results['PAGES'] = & $pages;
$results['PAGES'] = array_slice($results['PAGES'], $start_slice);
- $results['PAGES'] = array_slice($results['PAGES'], $limit -
+ $results['PAGES'] = array_slice($results['PAGES'], $limit -
$start_slice, $num);
@@ -794,7 +798,7 @@ class PhraseModel extends Model
/**
- * Using the supplied $word_structs, contructs an iterator for getting
+ * Using the supplied $word_structs, constructs an iterator for getting
* results to a query
*
* @param array $word_structs an array of word_structs. Here a word_struct
@@ -808,10 +812,10 @@ class PhraseModel extends Model
* results
* and then potentially restored in cache
* @param int $raw ($raw == 0) normal grouping, ($raw == 1)
- * no grouping but page look-up for links, ($raw == 2)
+ * no grouping but page look-up for links, ($raw == 2)
* no grouping done on data
*
- * @return &object an iterator for iterating through results to the
+ * @return &object an iterator for iterating through results to the
* query
*/
function getQueryIterator($word_structs, &$filter, $raw = 0)
@@ -821,26 +825,33 @@ class PhraseModel extends Model
foreach($word_structs as $word_struct) {
if(!is_array($word_struct)) { continue;}
$word_keys = $word_struct["KEYS"];
+ $distinct_word_keys = array_values(array_unique($word_keys)); // reindex 0..n-1; array_unique() keeps the original keys
$restrict_phrases = $word_struct["RESTRICT_PHRASES"];
$disallow_keys = $word_struct["DISALLOW_KEYS"];
$index_archive = $word_struct["INDEX_ARCHIVE"];
$weight = $word_struct["WEIGHT"];
$num_word_keys = count($word_keys);
- $total_iterators += $num_word_keys;
+ $num_distinct_keys = count($distinct_word_keys); // one WordIterator per distinct key of this word struct
+ $total_iterators += $num_distinct_keys; // accumulate over all word structs, as the replaced line did
$word_iterators = array();
+ $word_iterator_map = array(); //Added by Ravi Dhillon
if($num_word_keys < 1) {continue;}
- for($i = 0; $i < $num_word_keys; $i++) {
- $word_iterators[$i] =
- new WordIterator($word_keys[$i], $index_archive,
- false, $filter);
+ for($i = 0; $i < $num_distinct_keys; $i++) {
+ $word_iterators[$i] =
+ new WordIterator($distinct_word_keys[$i], $index_archive, //Modified by Ravi Dhillon
+ false, $filter);
+ // point every position of this key in $word_keys at its shared iterator
+ foreach (array_keys($word_keys, $distinct_word_keys[$i]) as $index) {
+ $word_iterator_map[$index] = $i;
+ }
}
$num_disallow_keys = count($disallow_keys);
if($num_disallow_keys > 0) {
for($i = 0; $i < $num_disallow_keys; $i++) {
- $disallow_iterator =
- new WordIterator($disallow_keys[$i], $index_archive,
+ $disallow_iterator =
+ new WordIterator($disallow_keys[$i], $index_archive,
false, $filter);
$word_iterators[$num_word_keys + $i] =
new NegationIterator($disallow_iterator);
@@ -851,13 +862,13 @@ class PhraseModel extends Model
if($num_word_keys == 1) {
$base_iterator = $word_iterators[0];
} else {
- $base_iterator = new IntersectIterator($word_iterators);
+ $base_iterator = new IntersectIterator($word_iterators,$word_iterator_map); //Modified by Ravi Dhillon
}
if($restrict_phrases == NULL && $disallow_keys == array() &&
$weight == 1) {
$iterators[] = $base_iterator;
} else {
- $iterators[] = new PhraseFilterIterator($base_iterator,
+ $iterators[] = new PhraseFilterIterator($base_iterator,
$restrict_phrases, $weight);
}
@@ -877,10 +888,10 @@ class PhraseModel extends Model
$group_iterator = $union_iterator;
} else if ($raw == 1) {
- $group_iterator =
+ $group_iterator =
new GroupIterator($union_iterator, $total_iterators, true);
} else {
- $group_iterator =
+ $group_iterator =
new GroupIterator($union_iterator, $total_iterators);
}