diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index f05aad0..f9a134e 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -90,18 +90,35 @@ class IntersectIterator extends IndexBundleIterator
var $num_words;
/**
+ * This iterator returns only documents containing all the elements of
+ * restrict phrases
+ * @var array
+ */
+ var $restrict_phrases;
+
+ /**
+ * A weighting factor multiplied into the DOC_RANK, RELEVANCE, PROXIMITY
+ * and SCORE of each doc returned from this iterator
+ * @var float
+ */
+ var $weight;
+
+ /**
* Creates an intersect iterator with the given parameters.
*
* @param object $index_bundle_iterator to use as a source of documents
* to iterate over
*/
- function __construct($index_bundle_iterators, $word_iterator_map)
+ function __construct($index_bundle_iterators, $word_iterator_map,
+ $restrict_phrases = NULL, $weight = 1)
{
$this->index_bundle_iterators = $index_bundle_iterators;
$this->word_iterator_map = $word_iterator_map;
$this->num_words = count($word_iterator_map);
$this->num_iterators = count($index_bundle_iterators);
$this->num_docs = 0;
+ $this->restrict_phrases = $restrict_phrases;
+ $this->weight = $weight;
$this->results_per_block = 1;
/*
@@ -165,7 +182,7 @@ class IntersectIterator extends IndexBundleIterator
}
//next we finish computing BM25F
$docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
-
+ $weight = $this->weight;
if(is_array($docs) && count($docs) == 1) {
//we get intersect docs one at a time so should be only one
$keys = array_keys($docs);
@@ -206,6 +223,12 @@ class IntersectIterator extends IndexBundleIterator
}
$docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] *
$docs[$key][self::RELEVANCE] * $docs[$key][self::PROXIMITY];
+ if($weight != 1) {
+ $docs[$key][self::DOC_RANK] *= $weight;
+ $docs[$key][self::RELEVANCE] *= $weight;
+ $docs[$key][self::PROXIMITY] *= $weight;
+ $docs[$key][self::SCORE] *= $weight;
+ }
}
$this->count_block = count($docs);
$this->pages = $docs;
@@ -279,8 +302,8 @@ class IntersectIterator extends IndexBundleIterator
if(count($position_list[$l[1]]) == 0) {
$stop = true;
}
-
}
+
array_push($covers, array($l[0],$r[0]));
$score = 0;
if($is_doc) {
diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php
deleted file mode 100644
index 1c0847b..0000000
--- a/lib/index_bundle_iterators/phrase_filter_iterator.php
+++ /dev/null
@@ -1,294 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2012 Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @package seek_quarry
- * @subpackage iterator
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009 - 2012
- * @filesource
- */
-
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- *Loads base class for iterating
- */
-require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
-
-/**
- * Used to iterate through a collection of documents to return only those
- * which have certain restricted_phrases.
- *
- * For restricted_phrases a string like "Chris * Homepage" will match any
- * string where * has been replace by any other string. So for example it will
- * match Chris Pollett's Homepage.
- *
- *
- * @author Chris Pollett
- * @package seek_quarry
- * @subpackage iterator
- * @see IndexArchiveBundle
- */
-class PhraseFilterIterator extends IndexBundleIterator
-{
- /**
- * The iterator we are using to get documents from
- * @var string
- */
- var $index_bundle_iterator;
-
- /**
- * This iterator returns only documents containing all the elements of
- * restrict phrases
- * @var array
- */
- var $restrict_phrases;
-
- /**
- * The number of documents in the current block before filtering
- * by restricted words
- * @var int
- */
- var $count_block_unfiltered;
-
- /**
- * The number of iterated docs before the restriction test
- * @var int
- */
- var $seen_docs_unfiltered;
-
- /**
- * Doc block with summaries for current doc block
- * @var array
- */
- var $summaries;
-
- /**
- * A weighting factor to multiply with each doc SCORE returned from this
- * iterator
- * @var float
- */
- var $weight;
-
- /**
- * Creates a phrase filter iterator with the given parameters.
- *
- * @param object $index_bundle_iterator to use as a source of documents
- * to iterate over
- * @param array $restrict_phrases this iterator returns only documents from
- * $index_bundle_iterator containing all the elements of restrict
- * phrases
- * @param float $weight a quantity to multiply each score returned from
- * this iterator with
- */
- function __construct($index_bundle_iterator, $restrict_phrases,
- $weight = 1)
- {
- $this->index_bundle_iterator = $index_bundle_iterator;
- $this->restrict_phrases = $restrict_phrases;
- $this->num_docs = $this->index_bundle_iterator->num_docs;
- $this->results_per_block =
- $this->index_bundle_iterator->results_per_block;
- $this->weight = $weight;
- $this->current_block_fresh = false;
- $this->reset();
- }
-
- /**
- * Returns the iterators to the first document block that it could iterate
- * over
- */
- function reset()
- {
- $this->index_bundle_iterator->reset();
- $this->seen_docs = 0;
- $this->seen_docs_unfiltered = 0;
- $doc_block = $this->currentDocsWithWord();
- }
-
- /**
- * Computes a relevancy score for a posting offset with respect to this
- * iterator and generation
- * @param int $generation the generation the posting offset is for
- * @param int $posting_offset an offset into word_docs to compute the
- * relevance of
- * @return float a relevancy score based on BM25F.
- */
- function computeRelevance($generation, $posting_offset)
- {
- return $this->index_bundle_iterator->computeRelevance(
- $generation, $posting_offset);
- }
-
- /**
- * Hook function used by currentDocsWithWord to return the current block
- * of docs if it is not cached
- *
- * @return mixed doc ids and score if there are docs left, -1 otherwise
- */
- function findDocsWithWord()
- {
- $pages = $this->index_bundle_iterator->getSummariesFromCurrentDocs();
- $this->count_block_unfiltered = count($pages);
- if(!is_array($pages)) {
- return $pages;
- }
- $out_pages = array();
- if(count($pages) > 0 ) {
- foreach($pages as $doc_key => $doc_info) {
- if(isset($doc_info[self::SUMMARY_OFFSET])) {
- /*
- if have SUMMARY_OFFSET then should have tried to get
- TITLE, etc.
- */
- $page_string =
- PhraseParser::extractWordStringPageSummary(
- $doc_info[self::SUMMARY]);
-
- $found = true;
-
- if($this->restrict_phrases != NULL) {
- foreach($this->restrict_phrases as $pre_phrase) {
- $phrase_parts = explode("*", $pre_phrase);
-
- $phrase = "";
- $first= "";
- foreach($phrase_parts as $part) {;
- $phrase .= $first . preg_quote($part);
- $first= '(.)*';
- }
-
- if(strlen($phrase) > 0 &&
- mb_eregi($phrase, $page_string) === false) {
- $found = false;
- }
- }
- }
-
- if($found == true) {
- $doc_info["WEIGHT"] = $this->weight;
- $doc_info[self::DOC_RANK] *= $this->weight;
- $doc_info[self::RELEVANCE] *= $this->weight;
- $doc_info[self::PROXIMITY] *= $this->weight;
- $doc_info[self::SCORE] *= $this->weight;
- $out_pages[$doc_key] = $doc_info;
- }
- }
- }
- $pages = $out_pages;
- }
- $this->count_block = count($pages);
-
- $this->summaries = $pages;
- $this->pages = array();
- foreach($pages as $doc_key => $doc_info) {
- $this->pages[$doc_key] = $doc_info;
- unset($this->pages[$doc_key][self::SUMMARY]);
- }
- return $pages;
-
- }
-
- /**
- * Gets the summaries associated with the keys provided the keys
- * can be found in the current block of docs returned by this iterator
- * @param array $keys keys to try to find in the current block of returned
- * results
- * @return array doc summaries that match provided keys
- */
- function getSummariesFromCurrentDocs($keys = NULL, $get_summaries = true)
- {
- if($this->current_block_fresh == false) {
- $result = $this->currentDocsWithWord();
- if(!is_array($result)) {
- return $result;
- }
- }
- if(!is_array($this->pages)) {
- return $this->pages;
- }
- if($keys == NULL) {
- $keys = array_keys($this->pages);
- }
- $out_pages = array();
- foreach($keys as $doc_key) {
- if(!isset($this->summaries[$doc_key])) {
- continue;
- } else {
- $out_pages[$doc_key] = $this->summaries[$doc_key];
- }
- }
- return $out_pages;
- }
-
- /**
- * Forwards the iterator one group of docs
- * @param array $gen_doc_offset a generation, doc_offset pair. If set,
- * the must be of greater than or equal generation, and if equal the
- * next block must all have $doc_offsets larger than or equal to
- * this value
- */
- function advance($gen_doc_offset = null)
- {
- $this->advanceSeenDocs();
-
-
- $this->seen_docs_unfiltered += $this->count_block_unfiltered;
-
-
-
- if($this->seen_docs_unfiltered > 0) {
- $this->num_docs =
- floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/
- $this->seen_docs_unfiltered);
- } else {
- $this->num_docs = 0;
- }
-
- $this->index_bundle_iterator->advance($gen_doc_offset);
- }
-
- /**
- * Gets the doc_offset and generation for the next document that
- * would be return by this iterator
- *
- * @return mixed an array with the desired document offset
- * and generation; -1 on fail
- */
- function currentGenDocOffsetWithWord() {
- $this->index_bundle_iterator->currentDocOffsetGenWithWord();
- }
-
- /**
- * Returns the index associated with this iterator
- * @return &object the index
- */
- function getIndex($key = NULL)
- {
- return $this->index_bundle_iterator->getIndex($key = NULL);
- }
-}
-?>
diff --git a/models/phrase_model.php b/models/phrase_model.php
index caec388..00707b1 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -553,6 +553,8 @@ class PhraseModel extends ParallelModel
$hashes[] = crawlHash($word);
}
+ $quoteds = array_unique($quoteds);
+ $quoteds = array_filter($quoteds);
$restrict_phrases = $quoteds;
if(count($hashes) > 0) {
@@ -561,16 +563,15 @@ class PhraseModel extends ParallelModel
$word_keys = NULL;
$word_struct = NULL;
}
- $restrict_phrases = array_unique($restrict_phrases);
- $restrict_phrases = array_filter($restrict_phrases);
+
if(!$index_dummy_flag) {
$index_archive->setCurrentShard(0, true);
}
$disallow_keys = array();
$num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
for($i = 0; $i < $num_disallow_keys; $i++) {
- $disallow_stem=array_keys(PhraseParser::extractPhrasesAndCount(
- $disallow_phrases[$i], 2, getLocaleTag()));
+ $disallow_stem = PhraseParser::extractPhrases(
+ $disallow_phrases[$i], getLocaleTag());
//stemmed
$disallow_keys[] = crawlHash($disallow_stem[0]);
}
@@ -592,9 +593,9 @@ class PhraseModel extends ParallelModel
/**
- * The plan is code toguess from the query what the user is
- * looking for will be called from here. For now, we are just guessing
- * when a query term is a url and rewriting it to the appropriate meta
+ * Ideally, this function tries to guess from the query what the
+ * user is looking for. For now, we are just guessing when a query term
+ * is a url and rewriting it to the appropriate
* meta word.
*
* @param string $phrase input query to guess semantics of
@@ -989,19 +990,13 @@ class PhraseModel extends ParallelModel
$base_iterator = $word_iterators[0];
} else {
$base_iterator = new IntersectIterator(
- $word_iterators, $word_iterator_map);
- }
- if($restrict_phrases == NULL && $disallow_keys == array() &&
- $weight == 1) {
- $iterators[] = $base_iterator;
- } else {
- $iterators[] = new PhraseFilterIterator($base_iterator,
- $restrict_phrases, $weight);
+ $word_iterators, $word_iterator_map, $restrict_phrases,
+ $weight);
}
-
+ $iterators[] = $base_iterator;
}
}
- $num_iterators = count($iterators);
+ $num_iterators = count($iterators); //if network_flag should be 1
if( $num_iterators < 1) {
return NULL;