Fixes bugs in group and intersect iterators, a=chris

Chris Pollett [2010-11-28 23h]
Fixes bugs in group and intersect iterators, a=chris
Filename
bin/fetcher.php
configs/config.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 544180d27..b39049612 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -36,7 +36,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","700M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","750M"); //so have enough memory to crawl big pages

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
diff --git a/configs/config.php b/configs/config.php
index baed0fd16..3d976f46b 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -43,7 +43,7 @@ date_default_timezone_set('America/Los_Angeles');

 /*+++ The next block of code is machine edited, change at
 your own risk, please use configure web page instead +++*/
-define('WORK_DIRECTORY', '');
+define('WORK_DIRECTORY', '/Applications/xampp/xamppfiles/htdocs/crawls');
 /*++++++*/

 if(file_exists(WORK_DIRECTORY."/profile.php")) {
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 93b49211c..2d2e30bbc 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -167,7 +167,6 @@ class GroupIterator extends IndexBundleIterator
                         }
                     }
                 } else {
-
                     $pre_out_pages[$hash_url][] = $doc_info;
                     if($doc_info['IS_PAGE'] == true) {
                         $pre_out_pages[$hash_url]['IS_PAGE'] = true;
@@ -190,9 +189,10 @@ class GroupIterator extends IndexBundleIterator
                     if(is_array($doc_array) && count($doc_array) == 1) {
                         $keys = array_keys($doc_array);
                         $key = $keys[0];
-                        if(!isset($doc_array[$key][self::DUPLICATE]) ) {
+                        if(!isset($doc_array[$key][self::DUPLICATE]) ) {;
                             $pre_out_pages[$hash_url][$key] = $doc_array[$key];
                             $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true;
+                            $pre_out_pages[$hash_url][$key]['KEY'] = $key;
                         } else {
                             /*
                                 Deduplication:
@@ -218,7 +218,7 @@ class GroupIterator extends IndexBundleIterator
                 foreach($group_infos as $doc_info) {
                     $is_page = $doc_info['IS_PAGE'];
                     unset($doc_info['IS_PAGE']);
-                    if(!isset($out_pages[$hash_url])) {
+                    if(!isset($out_pages[$hash_url]) || $is_page) {
                         $out_pages[$hash_url] = $doc_info;
                         $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
                         if(isset($doc_info[self::SUMMARY_OFFSET]) ) {
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index 62f36b691..31f18f3a8 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -97,7 +97,7 @@ class IntersectIterator extends IndexBundleIterator
         $this->index_bundle_iterators = $index_bundle_iterators;

         $this->num_iterators = count($index_bundle_iterators);
-        $this->num_docs = -1;
+        $this->num_docs = 0;
         $this->results_per_block = 1;

         /*
@@ -107,10 +107,7 @@ class IntersectIterator extends IndexBundleIterator
              iterator
         */
         for($i = 0; $i < $this->num_iterators; $i++) {
-            if( $this->num_docs < 0 ||
-                $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) {
-                $this->num_docs = $this->index_bundle_iterators[$i]->num_docs;
-            }
+            $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
             $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
         }
         $this->reset();
@@ -157,19 +154,20 @@ class IntersectIterator extends IndexBundleIterator
     function syncDocOffsetsAmongstIterators()
     {
         $biggest_offset = 0;
-        $all_same = true;
         do{
+            $all_same = true;
             for($i = 0; $i < $this->num_iterators; $i++) {
-                $new_doc_offset =
-                    $this->index_bundle_iterators[$i]->currentDocOffsetWithWord();
+                $new_doc_offset[$i] =
+                    $this->index_bundle_iterators[
+                        $i]->currentDocOffsetWithWord();
                 if($i == 0) {
-                    $biggest_offset = $new_doc_offset;
+                    $biggest_offset = $new_doc_offset[$i];
                 }
-                if($new_doc_offset == -1) {
+                if($new_doc_offset[$i] == -1) {
                     return -1;
                 }
-                if($new_doc_offset > $biggest_offset) {
-                    $biggest_offset = $new_doc_offset;
+                if($new_doc_offset[$i] > $biggest_offset) {
+                    $biggest_offset = $new_doc_offset[$i];
                     $all_same = false;
                 }
             }
@@ -177,7 +175,9 @@ class IntersectIterator extends IndexBundleIterator
                 return 1;
             }
             for($i = 0; $i < $this->num_iterators; $i++) {
-                $this->index_bundle_iterators[$i]->advance($biggest_offset);
+                if($new_doc_offset[$i] < $biggest_offset) {
+                    $this->index_bundle_iterators[$i]->advance($biggest_offset);
+                }
             }
         } while(!$all_same);
     }
@@ -198,15 +198,13 @@ class IntersectIterator extends IndexBundleIterator
         for($i = 0; $i < $this->num_iterators; $i++) {
              $this->seen_docs_unfiltered +=
                 $this->index_bundle_iterators[$i]->seen_docs;
-            $total_num_docs = $this->index_bundle_iterators[$i]->num_docs;
+            $total_num_docs += $this->index_bundle_iterators[$i]->num_docs;
         }
         if($this->seen_docs_unfiltered > 0) {
             $this->num_docs =
                 floor(($this->seen_docs * $total_num_docs) /
                 $this->seen_docs_unfiltered);
-        } else {
-            $this->num_docs = 0;
-        }
+        }

         $this->index_bundle_iterators[0]->advance($doc_offset);

diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 42db86f03..25e4846c7 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -171,6 +171,7 @@ class WordIterator extends IndexBundleIterator
             return -1;
         }
         $this->next_offset = $this->current_offset;
+        //the next call also updates next offset
         $results = $this->index->getCurrentShard()->getPostingsSlice(
             $this->next_offset, $this->last_offset, $this->results_per_block);
         return $results;
@@ -192,6 +193,10 @@ class WordIterator extends IndexBundleIterator
                     $this->index->getCurrentShard(
                         )->nextPostingOffsetDocOffset($this->next_offset,
                             $this->last_offset, $doc_offset);
+                if($this->current_offset === false) {
+                    $this->current_offset = $this->last_offset + 1;
+                    return;
+                }
                 $this->seen_docs =
                     ($this->current_offset - $this->start_offset)/
                         IndexShard::POSTING_LEN;
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 188fb45a1..09b9c2f2e 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -356,13 +356,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             ($last_offset - $next_offset) >> 2
             : 1;
         $results = array();
+        $end = min($this->word_docs_len, $last_offset);
         do {
-            if($next_offset >= $this->word_docs_len) {break;}
+            if($next_offset > $end) {break;}
             $item = array();
             $posting = $this->getWordDocsSubstring($next_offset, 4);
             list($doc_index, $occurrences) = $this->unpackPosting($posting);
             $old_next_offset = $next_offset;
-            $next_offset += 4;
+            $next_offset += self::POSTING_LEN;
             $doc_depth = log(10*(($doc_index +1) +
                 $this->generation_offset)*NUM_FETCHERS, 10);
             $item[self::DOC_RANK] = number_format(11 -
@@ -370,8 +371,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $doc_loc = $doc_index << 4;
             $doc_info_string = $this->getDocInfoSubstring($doc_loc, 12);
             $doc_id = substr($doc_info_string, 0, 8);
-            $tmp = unpack("N", substr($doc_info_string, 8, 4));
-            $item[self::SUMMARY_OFFSET] = $tmp[1];
+            $item[self::SUMMARY_OFFSET] = $this->unpackInt(
+                substr($doc_info_string, 8, 4));
             $is_doc = false;
             $skip_stats = false;

@@ -379,7 +380,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) {
                 $skip_stats = true;
                 $item[self::DUPLICATE] = true;
-            } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) {
+            } else if(($item[self::SUMMARY_OFFSET]
+                & self::COMPOSITE_ID_FLAG) !== 0) {
                 //handles link item case
                 $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG;
                 $doc_loc += 12;
@@ -410,8 +412,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $IDF = ($num_docs - $num_term_occurrences + 0.5) /
                     ($num_term_occurrences + 0.5);
                 $item[self::RELEVANCE] = $IDF * $pre_relevance;
+
                 $item[self::SCORE] = $item[self::DOC_RANK] +
-                    0.1*$item[self::RELEVANCE];
+                    .1/ ($item[self::RELEVANCE] + .1);
             }
             $results[$doc_id] = $item;
             $num_docs_so_far ++;
@@ -869,7 +872,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants

         do {
             $data = $this->readBlockShardAtOffset($block_offset);
-            if($data == false) {return $substring;}
+            if($data === false) {return $substring;}
             $block_offset += self::SHARD_BLOCK_SIZE;
             $substring .= substr($data, $start_loc);
             $start_loc = 0;
ViewGit