fixes bugs in pack and unpack since the change in what is stored in the dictionary, a=chris

Chris Pollett [2011-02-25 05:Feb:th]
fixes bugs in pack and unpack since the change in what is stored in the dictionary, a=chris
Filename
bin/queue_server.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
models/phrase_model.php
tests/index_shard_test.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 8fe94ca0b..400ce668f 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -327,7 +327,9 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * Even during a recrawl the fetcher may send robot data to the
+     * queue_server. This function prints a log message and calls another
+     * function to delete this useless robot file.
      */
     function processRecrawlRobotUrls()
     {
@@ -341,7 +343,10 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Even during a recrawl the fetcher may send robot data to the
+     * queue_server. This function deletes the passed robot file.
      *
+     * @param string $file robot file to delete
      */
     function processRecrawlRobotArchive($file)
     {
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index d3db6b4de..4749994da 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -286,7 +286,7 @@ class GroupIterator extends IndexBundleIterator
                 if($item !== false) {
                     $item[self::RELEVANCE] = 0.15 *
                         $pre_out_pages[$hash_url][0][self::RELEVANCE];
-                    if(isset($item[self::DOC_RANK])) {
+                    if(!isset($item[self::DOC_RANK])) {
                         $item[self::DOC_RANK] = 0.15 *
                             $pre_out_pages[$hash_url][0][self::DOC_RANK];
                     }
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 45098a500..df9fc946b 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -146,6 +146,7 @@ class WordIterator extends IndexBundleIterator
         $this->current_block_fresh = false;
         $this->dictionary_info =
             $index->dictionary->getWordInfo($word_key, true);
+
         if ($this->dictionary_info === false) {
             $this->empty = true;
         } else {
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 4a2cf452c..3ac39301d 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -902,6 +902,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function packWordDocs($fh = null, $to_string = false)
     {
+        if($this->word_docs_packed) {
+            return;
+        }
         $this->word_docs_len = 0;
         $this->word_docs = "";
         $total_out = "";
@@ -922,6 +925,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             } else {
                 $out = substr($postings, self::POSTING_LEN);
                 $out[0] = chr((0x80 | ord($out[0])));
+                $this->words[$word_id] = $out;
             }
             if($fh != null) {
                 fwrite($fh, $word_id . $out);
@@ -942,6 +946,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function unpackWordDocs()
     {
+        if(!$this->word_docs_packed) {
+            return;
+        }
         foreach($this->words as $word_id => $postings_info) {
             /* we are ignoring the first four bytes which contains
                generation info
@@ -949,6 +956,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             if((ord($postings_info[0]) & 0x80) > 0 ) {
                 $postings_info[0] = chr(ord($postings_info[0]) - 0x80);
                 $postings_info = self::HALF_BLANK . $postings_info;
+                $this->words[$word_id] = $postings_info;
             } else {
                 $offset = unpackInt(substr($postings_info, 4, 4));
                 $len = unpackInt(substr($postings_info, 8, 4));
@@ -965,7 +973,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * occurrences of a word in the document with that docindex.
      *
      * @param int $doc_index index (i.e., a count of which document it
-     *      is rather than a byte offet) of a document in the document string
+     *      is rather than a byte offset) of a document in the document string
      * @param int $occurrences number of times a word occurred in that doc
      * @return string a packed integer containing these two pieces of info.
      */
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 2e1facd7c..d4aa55e99 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -474,8 +474,10 @@ class PhraseModel extends Model
             foreach($next_docs as $doc_key => $doc_info) {
                 $summary = & $doc_info[CrawlConstants::SUMMARY];
                 unset($doc_info[CrawlConstants::SUMMARY]);
-                $pages[] = array_merge($doc_info, $summary);
-                $num_retrieved++;
+                if(is_array($summary)) {
+                    $pages[] = array_merge($doc_info, $summary);
+                    $num_retrieved++;
+                }
             }

         }
diff --git a/tests/index_shard_test.php b/tests/index_shard_test.php
index 667019f8e..c79650385 100644
--- a/tests/index_shard_test.php
+++ b/tests/index_shard_test.php
@@ -369,7 +369,14 @@ class IndexShardTest extends UnitTest
             $this->assertEqual($c_data['CCCCCCCCFFFFFFFF']
                 [CrawlConstants::SUMMARY_OFFSET],
                 0,  "Summary offset matches predicted second word");
+        $out_string = $this->test_objects['shard']->save(true);

+        $this->test_objects['shard2'] = IndexShard::load("shard.txt",
+            $out_string);
+        $this->test_objects['shard']->prefixes = NULL;
+        $this->test_objects['shard']->unpackWordDocs();
+        $this->test_objects['shard']->packWordDocs(null, true);
+        $this->test_objects['shard']->prefixes = NULL;
     }

     /**
@@ -445,6 +452,7 @@ class IndexShardTest extends UnitTest
             crawlHash('FFFFFFFF', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
             "String Load Doc lookup 2 by word works");
+
     }
 }
 ?>
ViewGit