Add negation_iterators for query processing and modifies phrase model to use it, a=chris

Chris Pollett [2011-09-03 06:Sep:rd]
Add negation_iterators for query processing and modifies phrase model to use it, a=chris
Filename
bin/fetcher.php
configs/config.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/negation_iterator.php
lib/index_bundle_iterators/phrase_filter_iterator.php
lib/index_bundle_iterators/union_iterator.php
models/phrase_model.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 72d732d67..a723caf39 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1568,6 +1568,7 @@ class Fetcher implements CrawlConstants
         }
         $meta_ids[] = 'info:'.$site[self::URL];
         $meta_ids[] = 'info:'.crawlHash($site[self::URL]);
+        $meta_ids[] = 'pages:all';

         foreach($site[self::IP_ADDRESSES] as $address) {
             $meta_ids[] = 'ip:'.$address;
diff --git a/configs/config.php b/configs/config.php
index dfc1b95fc..dcd7e8ca4 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -325,7 +325,7 @@ define ('EN_RATIO', 0.9);
 define ('AD_HOC_TITLE_LENGTH', 10);

 /** BM25F weight for title text */
-define ('TITLE_WEIGHT', 5);
+define ('TITLE_WEIGHT', 4);

 /** BM25F weight for other text within doc*/
 define ('DESCRIPTION_WEIGHT', 1);
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index 2e9513fbd..6b80b6a48 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -61,13 +61,6 @@ class IntersectIterator extends IndexBundleIterator
      */
     var $num_iterators;

-    /**
-     * The number of documents in the current block after filtering
-     * by restricted words
-     * @var int
-     */
-    var $count_block;
-
     /**
      * The number of iterated docs before the restriction test
      * @var int
@@ -103,7 +96,6 @@ class IntersectIterator extends IndexBundleIterator
         */
         for($i = 0; $i < $this->num_iterators; $i++) {
             $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
-            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
         }
         $this->reset();
     }
@@ -150,7 +142,6 @@ class IntersectIterator extends IndexBundleIterator
      */
     function findDocsWithWord()
     {
-        $pages = array();
         $status = $this->syncGenDocOffsetsAmongstIterators();

         if($status == -1) {
@@ -255,7 +246,7 @@ class IntersectIterator extends IndexBundleIterator
     }

     /**
-     * Finds the next generation and doc offet amongst all the iterators
+     * Finds the next generation and doc offset amongst all the iterators
      * that contains the word. It assumes that the (generation, doc offset)
      * pairs are ordered in an increasing fashion for the underlying iterators
      */
@@ -263,7 +254,6 @@ class IntersectIterator extends IndexBundleIterator
     {
         $biggest_gen_offset = $this->index_bundle_iterators[
                         0]->currentGenDocOffsetWithWord();
-
         $all_same = true;
         for($i = 0; $i < $this->num_iterators; $i++) {
             $cur_gen_doc_offset =
@@ -287,10 +277,10 @@ class IntersectIterator extends IndexBundleIterator
         }
         $last_changed = -1;
         $i = 0;
-        while($i != $last_changed) {
+        while($i != $last_changed) {
             if($last_changed == -1) $last_changed = 0;
             if($this->genDocOffsetCmp($gen_doc_offset[$i],
-                $biggest_gen_offset) < 0) {
+                $biggest_gen_offset) < 0) {
                 $iterator = $this->index_bundle_iterators[$i];
                 $iterator->advance($biggest_gen_offset);
                 $cur_gen_doc_offset =
@@ -311,6 +301,7 @@ class IntersectIterator extends IndexBundleIterator
                 $i = 0;
             }
         }
+
         return 1;
     }

@@ -352,7 +343,7 @@ class IntersectIterator extends IndexBundleIterator
      */
     function currentGenDocOffsetWithWord() {
         $this->syncGenDocOffsetsAmongstIterators();
-        $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord();
+        return $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord();
     }

     /**
@@ -378,8 +369,10 @@ class IntersectIterator extends IndexBundleIterator
      *      a block
      */
      function setResultsPerBlock($num) {
-        trigger_error("Cannot set the results per block of
-            an intersect iterator", E_USER_ERROR);
+        if($num != 1) {
+            trigger_error("Cannot set the results per block of
+                an intersect iterator", E_USER_ERROR);
+        }
      }
 }
 ?>
diff --git a/lib/index_bundle_iterators/negation_iterator.php b/lib/index_bundle_iterators/negation_iterator.php
new file mode 100644
index 000000000..8c83b2a0e
--- /dev/null
+++ b/lib/index_bundle_iterators/negation_iterator.php
@@ -0,0 +1,255 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage iterator
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+
+/**
+ *Loads base class for iterating
+ */
+require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
+
+/**
+ * Used to iterate over the documents which don't occur in a set of
+ * iterator results
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage iterator
+ * @see IndexArchiveBundle
+ */
+class NegationIterator extends IndexBundleIterator
+{
+    /**
+     * An array of iterators whose intersection we get documents from
+     * @var array
+     */
+    var $index_bundle_iterators;
+    /**
+     * Number of elements in $this->index_bundle_iterators
+     * @var int
+     */
+    var $num_iterators;
+    /**
+     * Index of the iterator amongst those we are intersecting to advance
+     * next
+     * @var int
+     */
+    var $to_advance_index;
+
+    /**
+     * Creates a negation iterator with the given parameters.
+     *
+     * @param object $index_bundle_iterator iterator whose documents are to
+     *      be excluded; its index and filter seed the "pages:all" iterator
+     */
+    function __construct($index_bundle_iterator)
+    {
+        $this->index_bundle_iterators[0] = new WordIterator(
+            crawlHash("pages:all", true),
+            $index_bundle_iterator->index, true,
+            $index_bundle_iterator->filter);
+        $this->index_bundle_iterators[1] = $index_bundle_iterator;
+
+        $this->num_iterators = 2;
+        $this->num_docs = 0;
+        $this->results_per_block = 1;
+
+        $this->num_docs = $this->index_bundle_iterators[0]->num_docs;
+
+        $this->reset();
+    }
+
+    /**
+     * Resets each underlying iterator (forcing one result per block) and
+     * zeroes the seen_docs and seen_docs_unfiltered counters
+     */
+    function reset()
+    {
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
+            $this->index_bundle_iterators[$i]->reset();
+        }
+
+        $this->seen_docs = 0;
+        $this->seen_docs_unfiltered = 0;
+
+    }
+
+    /**
+     * Relevancy hook for this iterator: both arguments are ignored and a
+     * constant score is returned
+     * @param int $generation the generation the posting offset is for (unused)
+     * @param int $posting_offset an offset into word_docs to compute the
+     *      relevance of (unused)
+     * @return int always 1
+     */
+    function computeRelevance($generation, $posting_offset)
+    {
+        return 1;
+    }
+
+    /**
+     * Hook function used by currentDocsWithWord to return the current block
+     * of docs if it is not cached
+     *
+     * @return mixed doc ids and rank if there are docs left, -1 otherwise
+     */
+    function findDocsWithWord()
+    {
+
+        $status = $this->syncGenDocOffsetsAmongstIterators();
+
+        if($status == -1) {
+            return -1;
+        }
+        //the doc is read from the all-pages iterator [0]; scores are constants
+        $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
+
+        if(is_array($docs) && count($docs) == 1) {
+            //results_per_block is 1 for both iterators, so expect a single doc
+            $keys = array_keys($docs);
+            $key = $keys[0];
+
+            $docs[$key][self::RELEVANCE] = 1;
+            $docs[$key][self::PROXIMITY] = 1;
+
+            $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] *
+                 $docs[$key][self::RELEVANCE] * $docs[$key][self::PROXIMITY];
+        }
+        $this->count_block = count($docs);
+        $this->pages = $docs;
+        return $docs;
+    }
+
+
+    /**
+     * Moves the all-docs iterator [0] and the negated-term iterator [1] until
+     * [0]'s (generation, doc offset) is strictly less than [1]'s.
+     * @return int 1 once that holds; -1 if either offset is -1 or stops moving
+     */
+    function syncGenDocOffsetsAmongstIterators()
+    {
+        $changed_term = false;
+        $changed_all = false;
+        do {
+            $gen_offset_all = $this->index_bundle_iterators[
+                0]->currentGenDocOffsetWithWord();
+            if($gen_offset_all == -1 || ($changed_all &&
+                $this->genDocOffsetCmp($gen_offset_all,
+                $old_gen_offset_all) == 0)) {
+                return -1;
+            }
+            $gen_offset_term =
+                $this->index_bundle_iterators[
+                    1]->currentGenDocOffsetWithWord();
+            if($gen_offset_term == -1 || ($changed_term &&
+                $this->genDocOffsetCmp($gen_offset_term,
+                $old_gen_offset_term) == 0)) {
+                return -1;
+            }
+            $gen_doc_cmp = $this->genDocOffsetCmp($gen_offset_all,
+                $gen_offset_term);
+            if($gen_doc_cmp > 0) {
+                $this->index_bundle_iterators[1]->advance($gen_offset_all);
+                $old_gen_offset_term = $gen_offset_term;
+                $changed_term = true;
+                $changed_all = false;
+            } else if($gen_doc_cmp == 0) {
+                $this->index_bundle_iterators[0]->advance($gen_offset_term);
+                $old_gen_offset_all = $gen_offset_all;
+                $changed_term = false;
+                $changed_all = true;
+            }
+        } while($gen_doc_cmp >= 0);
+
+        return 1;
+    }
+
+    /**
+     * Forwards the iterator one group of docs by advancing iterator [0]
+     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
+     *      the next block must be of greater than or equal generation, and
+     *      if equal it must all have $doc_offsets larger than or equal to
+     *      this value
+     */
+    function advance($gen_doc_offset = null)
+    {
+        $this->advanceSeenDocs();
+
+        $this->index_bundle_iterators[0]->advance($gen_doc_offset);
+
+    }
+
+    /**
+     * Gets the doc_offset and generation for the next document that
+     * would be returned by this iterator
+     *
+     * @return mixed an array with the desired document offset
+     *  and generation; -1 on fail
+     */
+    function currentGenDocOffsetWithWord() {
+        $this->syncGenDocOffsetsAmongstIterators();
+        return $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord();
+    }
+
+    /**
+     * Returns the index associated with this iterator
+     * @param mixed $key passed on to the all-pages iterator (default NULL)
+     * @return object the index
+     */
+    function getIndex($key = NULL) {
+        return $this->index_bundle_iterators[0]->getIndex($key);
+    }
+
+    /**
+     * This method is supposed to set
+     * the value of the result_per_block field. This field controls
+     * the maximum number of results that can be returned in one go by
+     * currentDocsWithWord(). This method cannot be consistently
+     * implemented for this iterator and expect it to behave nicely
+     * if this iterator is used together with union_iterator. So
+     * to prevent a user from doing this, calling this method with any
+     * value other than 1 results in a user defined error
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block
+     */
+     function setResultsPerBlock($num) {
+        if($num != 1) {
+            trigger_error("Cannot set the results per block of
+                a negation iterator", E_USER_ERROR);
+        }
+     }
+}
+?>
diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php
index c2fcc05dd..b1ba81c93 100644
--- a/lib/index_bundle_iterators/phrase_filter_iterator.php
+++ b/lib/index_bundle_iterators/phrase_filter_iterator.php
@@ -40,14 +40,12 @@ require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';

 /**
  * Used to iterate through a collection of documents to return only those
- * which have certain restricted_phrases and don't have disallowed_phrases.
+ * which have certain restricted_phrases.
  *
  * For restricted_phrases a string like "Chris * Homepage" will match any
  * string where * has been replace by any other string. So for example it will
  * match Chris Pollett's Homepage.
  *
- * disallowed_phrases are really just disallowed words and must be an exact
- * match
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -69,12 +67,6 @@ class PhraseFilterIterator extends IndexBundleIterator
      */
     var $restrict_phrases;

-    /**
-     * This iterator returns only documents not containing any the elements of
-     * disallow phrases
-     * @var array
-     */
-    var $disallow_phrases;
     /**
      * The number of documents in the current block before filtering
      * by restricted words
@@ -109,18 +101,14 @@ class PhraseFilterIterator extends IndexBundleIterator
      * @param array $restrict_phrases this iterator returns only documents from
      *      $index_bundle_iterator containing all the elements of restrict
      *      phrases
-     * @param array $disallow_phrases this iterator returns only documents from
-     *      $index_bundle_iterator not containing any of the words in disallow
-     *      phrases
      * @param float $weight a quantity to multiply each score returned from
      *      this iterator with
      */
     function __construct($index_bundle_iterator, $restrict_phrases,
-        $disallow_phrases, $weight = 1)
+        $weight = 1)
     {
         $this->index_bundle_iterator = $index_bundle_iterator;
         $this->restrict_phrases = $restrict_phrases;
-        $this->disallow_phrases = $disallow_phrases;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
         $this->results_per_block =
             $this->index_bundle_iterator->results_per_block;
@@ -199,15 +187,7 @@ class PhraseFilterIterator extends IndexBundleIterator
                             }
                         }
                     }
-                    if($this->disallow_phrases != NULL &&
-                        is_array($this->disallow_phrases)) {
-                        foreach($this->disallow_phrases as $phrase) {
-                            if(strlen($phrase) > 0 &&
-                                mb_eregi($phrase, $page_string)  !== false) {
-                                $found = false;
-                            }
-                        }
-                    }
+
                     if($found == true) {
                         $doc_info["WEIGHT"] = $this->weight;
                         $doc_info[self::SCORE] *= $this->weight;
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
index 695a9ce36..de0f54f08 100644
--- a/lib/index_bundle_iterators/union_iterator.php
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -65,20 +65,12 @@ class UnionIterator extends IndexBundleIterator
      * @var int
      */
     var $num_iterators;
-
     /**
      * The number of documents in the current block before filtering
      * by restricted words
      * @var int
      */
     var $count_block_unfiltered;
-    /**
-     * The number of documents in the current block after filtering
-     * by restricted words
-     * @var int
-     */
-    var $count_block;
-
     /**
      * The number of iterated docs before the restriction test
      * @var int
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 114264b03..8b8016bab 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -41,7 +41,10 @@ if(!defined("POST_PROCESSING")) {
     define("LOG_TO_FILES", false);
 }
 /** For crawlHash function */
-require_once BASE_DIR."/lib/utility.php";
+require_once BASE_DIR."/lib/utility.php";
+/** For extractPhrasesAndCount function */
+require_once BASE_DIR."/lib/phrase_parser.php";
+
 /**
  * Used to look up words and phrases in the inverted index
  * associated with a given crawl
@@ -454,7 +457,7 @@ class PhraseModel extends Model
         $meta_words = array('link:', 'site:', 'version:', 'modified:',
             'filetype:', 'info:', '\-', 'os:', 'server:', 'date:',
             'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:',
-            'lang:', 'media:', 'elink:');
+            'lang:', 'media:', 'elink:', 'pages:');
         if(isset($this->additional_meta_words)) {
             $meta_words = array_merge($meta_words, array_keys(
                 $this->additional_meta_words));
@@ -523,7 +526,7 @@ class PhraseModel extends Model
             $phrase_string = $words[0];
             $phrase_hash = crawlHash($phrase_string);
             $word_struct = array("KEYS" => array($phrase_hash),
-                "RESTRICT_PHRASES" => NULL, "DISALLOW_PHRASES" => NULL,
+                "RESTRICT_PHRASES" => NULL, "DISALLOW_KEYS" => array(),
                 "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive
             );
         } else {
@@ -543,8 +546,7 @@ class PhraseModel extends Model

             $hashes = array();
             foreach($words as $word) {
-                $tmp = crawlHash($word);
-                $hashes[] = $tmp;
+                $hashes[] = crawlHash($word);
             }

             $restrict_phrases = $quoteds;
@@ -560,11 +562,19 @@ class PhraseModel extends Model
                 $word_keys = NULL;
                 $word_struct = NULL;
             }
+            $disallow_keys = array();
+            $num_disallow_keys = min(5, count($disallow_phrases));
+            for($i = 0; $i < $num_disallow_keys; $i++) {
+                $disallow_stem=array_keys(PhraseParser::extractPhrasesAndCount(
+                    $disallow_phrases[$i], 2, getLocaleTag()));
+                        //stemmed
+                $disallow_keys[] = crawlHash($disallow_stem[0]);
+            }

             if($word_keys !== NULL) {
                 $word_struct = array("KEYS" => $word_keys,
                     "RESTRICT_PHRASES" => $restrict_phrases,
-                    "DISALLOW_PHRASES" => $disallow_phrases,
+                    "DISALLOW_KEYS" => $disallow_keys,
                     "WEIGHT" => $weight,
                     "INDEX_ARCHIVE" => $index_archive
                 );
@@ -650,7 +660,7 @@ class PhraseModel extends Model
             foreach($word_structs as $word_struct) {
                 $mem_tmp .= serialize($word_struct["KEYS"]).
                     serialize($word_struct["RESTRICT_PHRASES"]) .
-                    serialize($word_struct["DISALLOW_PHRASES"]) .
+                    serialize($word_struct["DISALLOW_KEYS"]) .
                     $word_struct["WEIGHT"] .
                     $word_struct["INDEX_ARCHIVE"]->dir_name;
             }
@@ -755,7 +765,7 @@ class PhraseModel extends Model
             if(!is_array($word_struct)) { continue;}
             $word_keys = $word_struct["KEYS"];
             $restrict_phrases = $word_struct["RESTRICT_PHRASES"];
-            $disallow_phrases = $word_struct["DISALLOW_PHRASES"];
+            $disallow_keys = $word_struct["DISALLOW_KEYS"];
             $index_archive = $word_struct["INDEX_ARCHIVE"];

             $weight = $word_struct["WEIGHT"];
@@ -769,17 +779,29 @@ class PhraseModel extends Model
                     new WordIterator($word_keys[$i], $index_archive,
                         false, $filter);
             }
+            $num_disallow_keys = count($disallow_keys);
+            if($num_disallow_keys > 0) {
+            for($i = 0; $i < $num_disallow_keys; $i++) {
+                    $disallow_iterator =
+                        new WordIterator($disallow_keys[$i], $index_archive,
+                            false, $filter);
+                    $word_iterators[$num_word_keys + $i] =
+                        new NegationIterator($disallow_iterator);
+                }
+            }
+            $num_word_keys += $num_disallow_keys;
+
             if($num_word_keys == 1) {
                 $base_iterator = $word_iterators[0];
             } else {
                 $base_iterator = new IntersectIterator($word_iterators);
             }
-            if($restrict_phrases == NULL && $disallow_phrases == NULL &&
+            if($restrict_phrases == NULL && $disallow_keys == array() &&
                 $weight == 1) {
                 $iterators[] = $base_iterator;
             } else {
                 $iterators[] = new PhraseFilterIterator($base_iterator,
-                    $restrict_phrases, $disallow_phrases, $weight);
+                    $restrict_phrases, $weight);
             }

         }
ViewGit