adding files I forgot to add, a=chris

Chris Pollett [2013-05-20 22:34:03]
adding files I forgot to add, a=chris
Filename
lib/index_bundle_iterators/disjoint_iterator.php
tests/it_tokenizer_test.php
diff --git a/lib/index_bundle_iterators/disjoint_iterator.php b/lib/index_bundle_iterators/disjoint_iterator.php
new file mode 100644
index 0000000..56dd545
--- /dev/null
+++ b/lib/index_bundle_iterators/disjoint_iterator.php
@@ -0,0 +1,257 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage iterator
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {
+    echo "BAD REQUEST";
+    exit();
+}
+
+/** Loads the base class for iterating, which the class below extends */
+require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
+
+/**
+ * Used to iterate over the documents which occur in a set of disjoint iterators
+ * all belonging to the same index
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage iterator
+ * @see IndexArchiveBundle
+ */
+class DisjointIterator extends IndexBundleIterator
+{
+    /**
+     * The iterators whose disjoint union this iterator gets documents from
+     * @var array
+     */
+    var $index_bundle_iterators;
+    /**
+     * Number of elements in $this->index_bundle_iterators
+     * @var int
+     */
+    var $num_iterators;
+
+    /**
+     * The number of iterated docs before the restriction test
+     * @var int
+     */
+    var $seen_docs_unfiltered;
+
+    /**
+     * Index, amongst the iterators being disjoint unioned, of the one
+     * with the least gen_doc_offset
+     * @var int
+     */
+    var $least_offset_index;
+
+    /**
+     * Creates a disjoint union iterator over the given iterators.
+     *
+     * @param array $index_bundle_iterators iterators to use as the sources
+     *      of documents to iterate over
+     */
+    function __construct($index_bundle_iterators)
+    {
+        $this->index_bundle_iterators = $index_bundle_iterators;
+        $this->num_iterators = count($index_bundle_iterators);
+        $this->results_per_block = 1;
+        $this->num_docs = 0;
+        $this->seen_docs = 0;
+        $this->seen_docs_unfiltered = 0;
+
+        /*
+             The initial guess for num_docs is the sum of the num_docs of
+             the underlying iterators. Each underlying iterator is also set
+             up to return at most one posting at a time.
+        */
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $iterator = $this->index_bundle_iterators[$i];
+            $this->num_docs += $iterator->num_docs;
+            $iterator->setResultsPerBlock(1);
+            $this->seen_docs += $iterator->seen_docs;
+            // an iterator with no unfiltered count of its own contributes
+            // just its seen_docs; adding the running $this->seen_docs total
+            // here would count the earlier iterators a second time
+            $this->seen_docs_unfiltered +=
+                isset($iterator->seen_docs_unfiltered) ?
+                $iterator->seen_docs_unfiltered : $iterator->seen_docs;
+        }
+        $this->leastGenDocOffsetsAmongstIterators();
+    }
+
+    /**
+     * Resets each underlying iterator to the first document block that it
+     * could iterate over and zeroes this iterator's seen document counts
+     */
+    function reset()
+    {
+        $this->seen_docs = 0;
+        $this->seen_docs_unfiltered = 0;
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $iterator = $this->index_bundle_iterators[$i];
+            $iterator->setResultsPerBlock(1);
+            $iterator->reset();
+        }
+        $this->leastGenDocOffsetsAmongstIterators();
+    }
+
+    /**
+     * Computes a relevancy score for a posting offset by delegating to the
+     * underlying iterator that currently has the least gen_doc_offset
+     * @param int $generation the generation the posting offset is for
+     * @param int $posting_offset offset into word_docs to score
+     * @return float the relevancy score reported by that iterator
+     */
+    function computeRelevance($generation, $posting_offset)
+    {
+        if(!$this->current_block_fresh) {
+            // called only to refresh the block; the docs are not used here
+            $this->currentDocsWithWord();
+        }
+        return $this->index_bundle_iterators[
+            $this->least_offset_index]->computeRelevance(
+                $generation, $posting_offset);
+    }
+
+    /**
+     * Hook function used by currentDocsWithWord to return the current block
+     * of docs if it is not cached. The block is taken from the underlying
+     * iterator that currently has the least gen_doc_offset
+     *
+     * @return mixed doc ids and rank if there are docs left, -1 otherwise
+     */
+    function findDocsWithWord()
+    {
+        if($this->leastGenDocOffsetsAmongstIterators() == -1) {
+            // no underlying iterator reported a current document
+            return -1;
+        }
+        $least_iterator =
+            $this->index_bundle_iterators[$this->least_offset_index];
+        $this->pages = $least_iterator->currentDocsWithWord();
+        $this->count_block = count($this->pages);
+        return $this->pages;
+    }
+
+    /**
+     * Gets the doc_offset and generation for the next document that
+     * would be returned by this iterator
+     *
+     * @return mixed array with the doc offset and generation; -1 on fail
+     */
+    function currentGenDocOffsetWithWord() {
+        if($this->num_iterators == 0) { return -1; } // nothing to ask
+        $this->leastGenDocOffsetsAmongstIterators();
+        return $this->index_bundle_iterators[$this->least_offset_index
+            ]->currentGenDocOffsetWithWord();
+    }
+
+    /**
+     * Finds the least generation, doc offset pair amongst the current
+     * positions of all the iterators, recording which iterator holds it in
+     * $this->least_offset_index (0 if no iterator has a current document)
+     *
+     * @return mixed the least generation, doc offset array; -1 if none
+     */
+    function leastGenDocOffsetsAmongstIterators()
+    {
+        $least = -1;
+        $this->least_offset_index = 0;
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $current = $this->index_bundle_iterators[
+                $i]->currentGenDocOffsetWithWord();
+            if($least == -1 && is_array($current)) {
+                $is_lower = true; // first iterator found with a document
+            } else if($current == -1) {
+                $is_lower = false; // this iterator reported -1 (nothing)
+            } else {
+                $is_lower = ($this->genDocOffsetCmp($current, $least) < 0);
+            }
+            if($is_lower) {
+                $least = $current;
+                $this->least_offset_index = $i;
+            }
+        }
+        return $least;
+    }
+
+    /**
+     * Forwards the iterator one group of docs
+     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
+     *      the next block must be of greater than or equal generation, and
+     *      if equal the next block must all have $doc_offsets larger than
+     *      or equal to this value
+     */
+    function advance($gen_doc_offset = null)
+    {
+        $this->current_block_fresh = false;
+        $this->seen_docs += 1;
+        // num_docs can change when advance() is called, so re-sum the totals
+        // NOTE(review): every iterator is advanced, not only the one at
+        // least_offset_index -- confirm this cannot skip documents
+        $unfiltered = 0;
+        $total_num_docs = 0;
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $iterator = $this->index_bundle_iterators[$i];
+            $unfiltered += $iterator->seen_docs;
+            $total_num_docs += $iterator->num_docs;
+            $iterator->advance($gen_doc_offset);
+        }
+        $this->seen_docs_unfiltered = $unfiltered;
+        if($unfiltered > 0) {
+            $this->num_docs =
+                floor(($this->seen_docs * $total_num_docs) / $unfiltered);
+        }
+    }
+
+    /**
+     * This method is supposed to set the value of the results_per_block
+     * field, which controls the maximum number of results that can be
+     * returned in one go by currentDocsWithWord(). That cannot be
+     * implemented consistently for this iterator while still having it
+     * behave nicely when used together with union_iterator or
+     * intersect_iterator, so this iterator always works with blocks of
+     * size one. To prevent a user from asking for anything else, calling
+     * this method with a value other than 1 triggers a user error
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block; only 1 is accepted
+     */
+    function setResultsPerBlock($num) {
+        if($num != 1) {
+            trigger_error("Cannot set the results per block of ".
+                "a disjoint iterator", E_USER_ERROR);
+        }
+    }
+}
+?>
diff --git a/tests/it_tokenizer_test.php b/tests/it_tokenizer_test.php
new file mode 100644
index 0000000..1f9bad1
--- /dev/null
+++ b/tests/it_tokenizer_test.php
@@ -0,0 +1,101 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage test
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ *  Load the Italian Tokenizer via phrase_parser (5.4 hack)
+ */
+require_once BASE_DIR."/lib/phrase_parser.php";
+/**
+ *  Load the run function (path starts with "/" like the require above)
+ */
+require_once BASE_DIR.'/lib/unit_test.php';
+
+/**
+ * My code for testing the Italian stemming algorithm. The inputs for the
+ * algorithm are words in
+ * http://snowball.tartarus.org/algorithms/italian/voc.txt and the resulting
+ * stems are compared with the stem words in
+ * http://snowball.tartarus.org/algorithms/italian/output.txt
+ *
+ * @author Akshat Kukreti
+ * @package seek_quarry
+ * @subpackage test
+ */
+
+class ItTokenizerTest extends UnitTest
+{
+    /**
+     * Creates the Italian tokenizer whose stem method the test exercises
+     */
+    function setUp()
+    {
+        $this->test_objects['FILE1'] = new ItTokenizer();
+    }
+
+    /**
+     * Nothing to clean up: setUp only creates an in-memory object
+     */
+    function tearDown()
+    {
+    }
+
+    /**
+     * Tests whether the stem function for the Italian stemming algorithm
+     * stems words according to the rules of stemming.
+     *
+     * Each word of the snowball input vocabulary file is stemmed and the
+     * result is compared with the same numbered line of the stemmed file
+     */
+    function stemmerTestCase()
+    {
+        $fixture_dir = BASE_DIR.'/tests/test_files/italian_stemmer';
+        // Italian test words from snowball, one per line
+        $test_words = file("$fixture_dir/input_vocabulary.txt");
+        // expected stems from snowball, line $i belongs to word $i
+        $stem_words = file("$fixture_dir/stemmed_result.txt");
+
+        $stemmer = $this->test_objects['FILE1'];
+        $num_words = count($test_words);
+        // trim() strips the line endings file() leaves on each entry
+        for($i = 0; $i < $num_words; $i++){
+            $word = trim($test_words[$i]);
+            $stem = trim($stem_words[$i]);
+            $this->assertEqual($stemmer->stem($word),
+                    $stem,"function stem correctly stems
+                    $word to $stem");
+        }
+    }
+}
+?>
ViewGit