Fixes a bug in word_iterator's advance method; also modifies the sitemap processor to skip gz files, as it can't handle them yet, a=chris
Fixes a bug in word_iterator's advance method; also modifies the sitemap processor to skip gz files, as it can't handle them yet, a=chris
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index fcc4da265..d7f5dc8af 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -186,6 +186,7 @@ class GroupIterator extends IndexBundleIterator
$pages = $this->computeBoostAndOutPages($pre_out_pages);
}
$this->pages = $pages;
+
return $pages;
}
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index a3819b39f..d68f5d151 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -33,11 +33,6 @@
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-/**
- *Loads BloomFilterFile to remember things we've already grouped
- */
-require_once BASE_DIR.'/lib/bloom_filter_file.php';
-
/**
*Loads base class for iterating
@@ -185,13 +180,15 @@ class IntersectIterator extends IndexBundleIterator
}
/**
- *
+ * Finds the next generation and doc offset amongst all the iterators
+ * that contains the word. It assumes that the (generation, doc offset)
+ * pairs are ordered in an increasing fashion for the underlying iterators
*/
function syncGenDocOffsetsAmongstIterators()
{
$biggest_gen_offset = NULL;
do{
- $all_same = true;
+ $all_same = true;
for($i = 0; $i < $this->num_iterators; $i++) {
$new_gen_doc_offset[$i] =
$this->index_bundle_iterators[
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 8932663da..2c7f686ea 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -230,6 +230,7 @@ class WordIterator extends IndexBundleIterator
$this->start_offset,
$this->next_offset, $this->last_offset, $this->results_per_block);
$this->count_block = count($results);
+
return $results;
}
@@ -255,7 +256,9 @@ class WordIterator extends IndexBundleIterator
$last_current_generation != $this->current_generation) {
$this->advanceGeneration();
$last_current_generation = $this->current_generation;
+ $this->next_offset = $this->current_offset;
}
+
$this->index->setCurrentShard($this->current_generation, true);
$this->current_offset =
@@ -276,7 +279,8 @@ class WordIterator extends IndexBundleIterator
}
/**
- *
+ * Switches which index shard is being used to return occurrences of
+ * the word to the next shard containing the word
*/
function advanceGeneration()
{
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
index 0770bda85..568e43cc1 100644
--- a/lib/processors/sitemap_processor.php
+++ b/lib/processors/sitemap_processor.php
@@ -137,7 +137,9 @@ class SitemapProcessor extends TextProcessor
$url = UrlParser::canonicalLink(
$node->textContent, $site);
if($url === NULL || $url === "" ||
- UrlParser::checkRecursiveUrl($url)) {
+ UrlParser::checkRecursiveUrl($url) ||
+ UrlParser::getDocumentType($url) == "gz") {
+ //at this point we can't handle gzip'd sitemaps
continue;
}
$sites[$url] = "From sitemap of ".$site;
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 6b6857add..7d93ff99c 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -339,15 +339,13 @@ class PhraseModel extends Model
if(is_array($words_array)) {
$counts = array_values($words_array);
$min_count = min($counts);
- $threshold = 3*$min_count;
+ $threshold = 4000*$min_count;
$word_keys = array();
foreach($words_array as $key => $count) {
if($count < $threshold) {
$word_keys[] = $key;
}
}
- $word_keys = array_keys($words_array);
-
} else {
$word_keys = NULL;
$word_struct = NULL;