more work fixing group_iterator glitches, a=chris
more work fixing group_iterator glitches, a=chris
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 98a814979..e3cec25f8 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -90,6 +90,12 @@ class GroupIterator extends IndexBundleIterator
*/
var $grouped_keys;
+ /**
+ * Hashes (meant as hash => true) of groups from earlier blocks, used to drop repeats
+ * @var array
+ */
+ var $grouped_hashes;
+
/**
* the minimum number of pages to group from a block;
* this trumps $this->index_bundle_iterator->results_per_block
@@ -121,6 +127,7 @@ class GroupIterator extends IndexBundleIterator
{
$this->index_bundle_iterator->reset();
$this->grouped_keys = array();
+ $this->grouped_hashes = array();
// -1 == never save, so file name not used using time to be safer
$this->seen_docs = 0;
$this->seen_docs_unfiltered = 0;
@@ -179,7 +186,7 @@ class GroupIterator extends IndexBundleIterator
*/
$this->current_block_hashes = array();
$pre_out_pages = array();
- $seen_hashes = array();
+ $this->current_seen_hashes = array();
if($this->count_block_unfiltered > 0 ) {
$i = $this->seen_docs;
foreach($pages as $doc_key => $doc_info) {
@@ -215,14 +222,15 @@ class GroupIterator extends IndexBundleIterator
new urls found in this block
*/
$this->current_block_hashes[] = $hash_url;
- $i++;
+ } else {
+ unset($pre_out_pages[$hash_url]);
}
}
/*get summary page for groups of link data if exists and don't have
also aggregate by hash
*/
- $seen_hashes = array();
+ $this->current_seen_hashes = array();
foreach($pre_out_pages as $hash_url => $data) {
if(!isset($pre_out_pages[$hash_url]['IS_PAGE'])) {
$hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL'];
@@ -249,23 +257,26 @@ class GroupIterator extends IndexBundleIterator
}
if(isset($pre_out_pages[$hash_url]['HASH_INFO_URL'])) {
unset($pre_out_pages[$hash_url]['HASH_INFO_URL']);
- }
+ }
if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
$hash = $pre_out_pages[$hash_url][0][self::HASH];
- if(isset($seen_hashes[$hash])) {
- $previous_url = $seen_hashes[$hash];
+ if(isset($this->grouped_hashes[$hash])) {
+ unset($pre_out_pages[$hash_url]);
+ } else if(isset($this->current_seen_hashes[$hash])) {
+ $previous_url = $this->current_seen_hashes[$hash];
if($pre_out_pages[$previous_url][0][
self::HASH_URL_COUNT] >=
count($pre_out_pages[$hash_url])) {
unset($pre_out_pages[$hash_url]);
} else {
- $seen_hashes[$hash] = $hash_url;
+ $this->current_seen_hashes[$hash] = $hash_url;
$pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
count($pre_out_pages[$hash_url]);
unset($pre_out_pages[$previous_url]);
}
} else {
- $seen_hashes[$hash] = $hash_url;
+ $i++;
+ $this->current_seen_hashes[$hash] = $hash_url;
$pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
count($pre_out_pages[$hash_url]);
}
@@ -273,6 +284,9 @@ class GroupIterator extends IndexBundleIterator
}
$this->count_block = count($pre_out_pages);
+ /*
+ Calculate grouped values for each field of the groups we found
+ */
$out_pages = array();
foreach($pre_out_pages as $hash_url => $group_infos) {
foreach($group_infos as $doc_info) {
@@ -393,12 +407,17 @@ class GroupIterator extends IndexBundleIterator
{
$this->advanceSeenDocs();
- $this->seen_docs_unfiltered += $this->count_block_unfiltered;
+ $this->seen_docs_unfiltered += $this->count_block_unfiltered;
if($this->seen_docs_unfiltered > 0) {
- $this->num_docs =
- floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/
- $this->seen_docs_unfiltered);
+ if($this->count_block_unfiltered < $this->results_per_block) {
+ $this->num_docs = $this->seen_docs;
+ } else {
+ $this->num_docs =
+ floor(
+ ($this->seen_docs*$this->index_bundle_iterator->num_docs)/
+ $this->seen_docs_unfiltered);
+ }
} else {
$this->num_docs = 0;
}
@@ -408,6 +427,10 @@ class GroupIterator extends IndexBundleIterator
$this->grouped_keys[$hash_url] = true;
}
+ foreach($this->current_seen_hashes as $hash) {
+ $this->grouped_hashes[$hash] = true;
+ }
+
$this->index_bundle_iterator->advance($gen_doc_offset);
}
diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini
index 35209cd4c..9cc2d360d 100755
--- a/locale/en-US/configure.ini
+++ b/locale/en-US/configure.ini
@@ -721,10 +721,10 @@ pagination_helper_next = "Next"
; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/layouts
;
; web_layout.php line: 65
-web_layout_title = "Yioop! PHP Search Engine"
+web_layout_title = "PHP Search Engine - Yioop!"
;
; web_layout.php line: 70
-web_layout_description = "Help find what you are searching for"
+web_layout_description = "PHP Crawler and Search Engine Software"
;
; web_layout.php line: 87
web_layout_query_statistics = "Query Statistics"
diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt
index b6bef56f0..5a165df53 100755
--- a/locale/en-US/statistics.txt
+++ b/locale/en-US/statistics.txt
@@ -1 +1 @@
-d:99;
\ No newline at end of file
+d:100;
\ No newline at end of file
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 7c89640ed..fa88d279c 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -463,12 +463,10 @@ class PhraseModel extends Model
unset($doc_info[CrawlConstants::SUMMARY]);
$pages[] = array_merge($doc_info, $summary);
$num_retrieved++;
- if($num_retrieved >= $to_retrieve) {
- break 2;
- }
}
}
+
usort($pages, "scoreOrderCallback");
if($num_retrieved < $to_retrieve) {