fixes a bug in word_iterator's advance method, and modifies the sitemap processor to not deal with gz files as it can't handle them yet, a=chris

Chris Pollett [2011-01-10 06:Jan:th]
fixes a bug in word_iterator's advance method, and modifies the sitemap processor to not deal with gz files as it can't handle them yet, a=chris
Filename
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/processors/sitemap_processor.php
models/phrase_model.php
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index fcc4da265..d7f5dc8af 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -186,6 +186,7 @@ class GroupIterator extends IndexBundleIterator
             $pages = $this->computeBoostAndOutPages($pre_out_pages);
         }
         $this->pages = $pages;
+
         return $pages;

     }
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index a3819b39f..d68f5d151 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads BloomFilterFile to remember things we've already grouped
- */
-require_once BASE_DIR.'/lib/bloom_filter_file.php';
-

 /**
  *Loads base class for iterating
@@ -185,13 +180,15 @@ class IntersectIterator extends IndexBundleIterator
     }

     /**
-     *
+     * Finds the next generation and doc offset amongst all the iterators
+     * that contains the word. It assumes that the (generation, doc offset)
+     * pairs are ordered in an increasing fashion for the underlying iterators
      */
     function syncGenDocOffsetsAmongstIterators()
     {
         $biggest_gen_offset = NULL;
         do{
-            $all_same = true;
+            $all_same = true;
             for($i = 0; $i < $this->num_iterators; $i++) {
                 $new_gen_doc_offset[$i] =
                     $this->index_bundle_iterators[
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 8932663da..2c7f686ea 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -230,6 +230,7 @@ class WordIterator extends IndexBundleIterator
             $this->start_offset,
             $this->next_offset, $this->last_offset, $this->results_per_block);
         $this->count_block = count($results);
+
         return $results;
     }

@@ -255,7 +256,9 @@ class WordIterator extends IndexBundleIterator
                       $last_current_generation != $this->current_generation) {
                     $this->advanceGeneration();
                     $last_current_generation = $this->current_generation;
+                    $this->next_offset = $this->current_offset;
                 }
+
                 $this->index->setCurrentShard($this->current_generation, true);

                 $this->current_offset =
@@ -276,7 +279,8 @@ class WordIterator extends IndexBundleIterator
     }

     /**
-     *
+     * Switches which index shard is being used to return occurrences of
+     * the word to the next shard containing the word
      */
     function advanceGeneration()
     {
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
index 0770bda85..568e43cc1 100644
--- a/lib/processors/sitemap_processor.php
+++ b/lib/processors/sitemap_processor.php
@@ -137,7 +137,9 @@ class SitemapProcessor extends TextProcessor
                 $url = UrlParser::canonicalLink(
                     $node->textContent, $site);
                 if($url === NULL || $url === "" ||
-                    UrlParser::checkRecursiveUrl($url)) {
+                    UrlParser::checkRecursiveUrl($url) ||
+                    UrlParser::getDocumentType($url) == "gz") {
+                    //at this point we can't handle gzip'd sitemaps
                     continue;
                 }
                 $sites[$url] = "From sitemap of ".$site;
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 6b6857add..7d93ff99c 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -339,15 +339,13 @@ class PhraseModel extends Model
             if(is_array($words_array)) {
                 $counts = array_values($words_array);
                 $min_count = min($counts);
-                $threshold = 3*$min_count;
+                $threshold = 4000*$min_count;
                 $word_keys = array();
                 foreach($words_array as $key => $count) {
                     if($count < $threshold) {
                         $word_keys[] = $key;
                     }
                 }
-                $word_keys = array_keys($words_array);
-
             } else {
                 $word_keys = NULL;
                 $word_struct = NULL;
ViewGit