adding files I forgot to add, a=chris
adding files I forgot to add, a=chris
diff --git a/lib/index_bundle_iterators/disjoint_iterator.php b/lib/index_bundle_iterators/disjoint_iterator.php
new file mode 100644
index 0000000..56dd545
--- /dev/null
+++ b/lib/index_bundle_iterators/disjoint_iterator.php
@@ -0,0 +1,257 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage iterator
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+
+/**
+ *Loads base class for iterating
+ */
+require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
+
+/**
+ * Used to iterate over the documents which occur in a set of disjoint iterators
+ * all belonging to the same index
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage iterator
+ * @see IndexArchiveBundle
+ */
+class DisjointIterator extends IndexBundleIterator
+{
+ /**
+ * An array of iterators whose disjoint union we get documents from
+ * @var array
+ */
+ var $index_bundle_iterators;
+ /**
+ * Number of elements in $this->index_bundle_iterators
+ * @var int
+ */
+ var $num_iterators;
+
+ /**
+ * The number of iterated docs before the restriction test
+ * @var int
+ */
+ var $seen_docs_unfiltered;
+
+ /**
+ * Index of the iterator, amongst those we are taking the disjoint union
+ * of, whose current gen_doc_offset is least
+ * @var int
+ */
+ var $least_offset_index;
+
+    /**
+     * Creates a disjoint union iterator over the given iterators.
+     *
+     * Each underlying iterator is switched to one result per block, and the
+     * num_docs, seen_docs and seen_docs_unfiltered counters start out as
+     * sums taken over the underlying iterators.
+     *
+     * @param array $index_bundle_iterators iterators to use as the sources
+     *      of documents to iterate over
+     */
+    function __construct($index_bundle_iterators)
+    {
+        $this->index_bundle_iterators = $index_bundle_iterators;
+        $this->num_iterators = count($index_bundle_iterators);
+        $this->results_per_block = 1;
+        // initial guess for num_docs: sum of the underlying num_docs
+        $this->num_docs = 0;
+        $this->seen_docs = 0;
+        $this->seen_docs_unfiltered = 0;
+        for($i = 0; $i < $this->num_iterators; $i++) {
+            $iterator = $this->index_bundle_iterators[$i];
+            $this->num_docs += $iterator->num_docs;
+            // we return at most one posting at a time from each iterator
+            $iterator->setResultsPerBlock(1);
+            $this->seen_docs += $iterator->seen_docs;
+            /* an iterator without an unfiltered count of its own
+               contributes its own seen_docs instead */
+            $this->seen_docs_unfiltered +=
+                isset($iterator->seen_docs_unfiltered) ?
+                $iterator->seen_docs_unfiltered : $iterator->seen_docs;
+        }
+        // remember which iterator currently has the least gen_doc_offset
+        $this->leastGenDocOffsetsAmongstIterators();
+    }
+
+    /**
+     * Rewinds every underlying iterator to its first document block and
+     * clears this iterator's seen document counters
+     */
+    function reset()
+    {
+        $index = 0;
+        while($index < $this->num_iterators) {
+            $iterator = $this->index_bundle_iterators[$index++];
+            $iterator->setResultsPerBlock(1);
+            $iterator->reset();
+        }
+        $this->seen_docs = $this->seen_docs_unfiltered = 0;
+        $this->leastGenDocOffsetsAmongstIterators();
+    }
+
+    /**
+     * Computes a relevancy score for a posting offset and generation by
+     * delegating to the iterator currently holding the least gen_doc_offset
+     *
+     * @param int $generation the generation the posting offset is for
+     * @param int $posting_offset an offset into word_docs to compute the
+     *      relevance of
+     * @return float the relevancy score computed by that underlying iterator
+     */
+    function computeRelevance($generation, $posting_offset)
+    {
+        if(!$this->current_block_fresh) {
+            $this->currentDocsWithWord(); // refresh the current block first
+        }
+        return $this->index_bundle_iterators[$this->least_offset_index
+            ]->computeRelevance($generation, $posting_offset);
+    }
+
+    /**
+     * Hook function used by currentDocsWithWord to fetch the current block of
+     * docs when it is not cached. The block comes from the underlying
+     * iterator whose current gen_doc_offset is least
+     *
+     * @return mixed doc ids and rank if there are docs left, -1 otherwise
+     */
+    function findDocsWithWord()
+    {
+        if($this->leastGenDocOffsetsAmongstIterators() == -1) {
+            return -1; // no underlying iterator reported a document
+        }
+        $source = $this->index_bundle_iterators[$this->least_offset_index];
+        $block = $source->currentDocsWithWord();
+        // keep the block and its size on this iterator before returning it
+        $this->count_block = count($block);
+        $this->pages = $block;
+        return $block;
+    }
+
+    /**
+     * Gets the doc_offset and generation for the next document that
+     * would be returned by this iterator, that is, the least such pair
+     * amongst the underlying iterators
+     *
+     * @return mixed an array with the desired document offset
+     *      and generation; -1 on fail
+     */
+    function currentGenDocOffsetWithWord() {
+        // -1 when no iterator reports a pair or no iterators were supplied
+        return $this->leastGenDocOffsetsAmongstIterators();
+    }
+
+    /**
+     * Finds the least (generation, doc offset) pair amongst the current
+     * pairs of the underlying iterators and records the position of the
+     * iterator holding it in $this->least_offset_index. That index is left
+     * at 0 when no pair is found
+     *
+     * @return mixed the least pair found; -1 if no iterator supplied one
+     */
+    function leastGenDocOffsetsAmongstIterators()
+    {
+        $this->least_offset_index = 0;
+        $minimum = -1;
+        for($position = 0; $position < $this->num_iterators; $position++) {
+            $candidate = $this->index_bundle_iterators[
+                $position]->currentGenDocOffsetWithWord();
+            if($minimum != -1 || !is_array($candidate)) {
+                // not the first pair seen: skip -1 results and larger pairs
+                if($candidate == -1 ||
+                    $this->genDocOffsetCmp($candidate, $minimum) >= 0) {
+                    continue;
+                }
+            }
+            // $candidate is the least pair seen so far
+            $minimum = $candidate;
+            $this->least_offset_index = $position;
+        }
+        return $minimum;
+    }
+
+    /**
+     * Forwards the iterator one group of docs. Only the underlying iterator
+     * with the least gen_doc_offset moves, so the others skip no documents
+     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
+     *      the next block must be of greater than or equal generation, and
+     *      if equal must have doc_offsets larger than or equal to this value
+     */
+    function advance($gen_doc_offset = null)
+    {
+        $this->current_block_fresh = false;
+        $this->seen_docs += 1;
+        //num_docs can change when advance() called so that's why we recompute
+        $this->seen_docs_unfiltered = 0;
+        $total_num_docs = 0;
+        foreach($this->index_bundle_iterators as $iterator) {
+            $this->seen_docs_unfiltered += $iterator->seen_docs;
+            $total_num_docs += $iterator->num_docs;
+        }
+        if($this->seen_docs_unfiltered > 0) {
+            $this->num_docs = floor(($this->seen_docs * $total_num_docs) /
+                $this->seen_docs_unfiltered);
+        }
+        // least_offset_index may be stale, so find the least iterator again
+        if($this->leastGenDocOffsetsAmongstIterators() !== -1) {
+            $this->index_bundle_iterators[$this->least_offset_index
+                ]->advance($gen_doc_offset);
+        }
+    }
+
+    /**
+     * This method is supposed to set the value of the results_per_block
+     * field, which controls the maximum number of results that can be
+     * returned in one go by currentDocsWithWord(). That cannot be
+     * implemented consistently for this iterator if it is to behave nicely
+     * when used together with union_iterator or intersect_iterator, so to
+     * prevent a user from doing this, calling this method with any value
+     * other than 1 results in a user defined error
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block
+     */
+    function setResultsPerBlock($num) {
+        if($num != 1) {
+            // only one result per block is supported by this iterator
+            trigger_error("Cannot set the results per block of ".
+                "a disjoint iterator", E_USER_ERROR);
+        }
+    }
+}
+?>
diff --git a/tests/it_tokenizer_test.php b/tests/it_tokenizer_test.php
new file mode 100644
index 0000000..1f9bad1
--- /dev/null
+++ b/tests/it_tokenizer_test.php
@@ -0,0 +1,101 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage test
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load the Italian Tokenizer via phrase_parser (5.4 hack)
+ */
+require_once BASE_DIR."/lib/phrase_parser.php";
+/**
+ * Load the run function
+ */
+require_once BASE_DIR.'/lib/unit_test.php';
+
+/**
+ * My code for testing the Italian stemming algorithm. The inputs for the
+ * algorithm are words in
+ * http://snowball.tartarus.org/algorithms/italian/voc.txt and the resulting
+ * stems are compared with the stem words in
+ * http://snowball.tartarus.org/algorithms/italian/output.txt
+ *
+ * @author Akshat Kukreti
+ * @package seek_quarry
+ * @subpackage test
+ */
+
+class ItTokenizerTest extends UnitTest
+{
+    /**
+     * Creates the Italian tokenizer whose stem method is under test
+     */
+    function setUp()
+    {
+        $this->test_objects['FILE1'] = new ItTokenizer();
+    }
+
+    /**
+     * Nothing needs to be cleaned up after a test case
+     */
+    function tearDown()
+    {
+    }
+
+    /**
+     * Checks the stem function of the Italian tokenizer against the
+     * snowball reference data: each line of input_vocabulary.txt is
+     * stemmed and the result is compared with the same numbered line of
+     * stemmed_result.txt
+     */
+    function stemmerTestCase()
+    {
+        $stem_dir = BASE_DIR.'/tests/test_files/italian_stemmer';
+        //words to stem, one per line, as provided by snowball
+        $test_words = file("$stem_dir/input_vocabulary.txt");
+        //the stems snowball expects, line for line
+        $stem_words = file("$stem_dir/stemmed_result.txt");
+        $tokenizer = $this->test_objects['FILE1'];
+        $num_words = count($test_words);
+        $position = 0;
+        while($position < $num_words) {
+            $word = trim($test_words[$position]);
+            $stem = trim($stem_words[$position]);
+            $stemmed = $tokenizer->stem($word);
+            $this->assertEqual($stemmed, $stem,
+                "function stem correctly stems
+ $word to $stem");
+            $position++;
+        }
+    }
+}
+?>