fixes bugs in pack and unpack since change in what is stored in dictionary, a=chris
fixes bugs in pack and unpack since change in what is stored in dictionary, a=chris
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 8fe94ca0b..400ce668f 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -327,7 +327,9 @@ class QueueServer implements CrawlConstants
}
/**
- *
+ * Even during a recrawl the fetcher may send robot data to the
+ * queue_server. This function prints a log message and calls another
+ * function to delete this useless robot file.
*/
function processRecrawlRobotUrls()
{
@@ -341,7 +343,10 @@ class QueueServer implements CrawlConstants
}
/**
+ * Even during a recrawl the fetcher may send robot data to the
+ * queue_server. This function deletes the passed robot file.
*
+ * @param string $file robot file to delete
*/
function processRecrawlRobotArchive($file)
{
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index d3db6b4de..4749994da 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -286,7 +286,7 @@ class GroupIterator extends IndexBundleIterator
if($item !== false) {
$item[self::RELEVANCE] = 0.15 *
$pre_out_pages[$hash_url][0][self::RELEVANCE];
- if(isset($item[self::DOC_RANK])) {
+ if(!isset($item[self::DOC_RANK])) {
$item[self::DOC_RANK] = 0.15 *
$pre_out_pages[$hash_url][0][self::DOC_RANK];
}
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 45098a500..df9fc946b 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -146,6 +146,7 @@ class WordIterator extends IndexBundleIterator
$this->current_block_fresh = false;
$this->dictionary_info =
$index->dictionary->getWordInfo($word_key, true);
+
if ($this->dictionary_info === false) {
$this->empty = true;
} else {
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 4a2cf452c..3ac39301d 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -902,6 +902,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
*/
function packWordDocs($fh = null, $to_string = false)
{
+ if($this->word_docs_packed) {
+ return;
+ }
$this->word_docs_len = 0;
$this->word_docs = "";
$total_out = "";
@@ -922,6 +925,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
} else {
$out = substr($postings, self::POSTING_LEN);
$out[0] = chr((0x80 | ord($out[0])));
+ $this->words[$word_id] = $out;
}
if($fh != null) {
fwrite($fh, $word_id . $out);
@@ -942,6 +946,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
*/
function unpackWordDocs()
{
+ if(!$this->word_docs_packed) {
+ return;
+ }
foreach($this->words as $word_id => $postings_info) {
/* we are ignoring the first four bytes which contains
generation info
@@ -949,6 +956,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
if((ord($postings_info[0]) & 0x80) > 0 ) {
$postings_info[0] = chr(ord($postings_info[0]) - 0x80);
$postings_info = self::HALF_BLANK . $postings_info;
+ $this->words[$word_id] = $postings_info;
} else {
$offset = unpackInt(substr($postings_info, 4, 4));
$len = unpackInt(substr($postings_info, 8, 4));
@@ -965,7 +973,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* occurrences of a word in the document with that docindex.
*
* @param int $doc_index index (i.e., a count of which document it
- * is rather than a byte offet) of a document in the document string
+ * is rather than a byte offset) of a document in the document string
* @param int $occurrences number of times a word occurred in that doc
* @return string a packed integer containing these two pieces of info.
*/
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 2e1facd7c..d4aa55e99 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -474,8 +474,10 @@ class PhraseModel extends Model
foreach($next_docs as $doc_key => $doc_info) {
$summary = & $doc_info[CrawlConstants::SUMMARY];
unset($doc_info[CrawlConstants::SUMMARY]);
- $pages[] = array_merge($doc_info, $summary);
- $num_retrieved++;
+ if(is_array($summary)) {
+ $pages[] = array_merge($doc_info, $summary);
+ $num_retrieved++;
+ }
}
}
diff --git a/tests/index_shard_test.php b/tests/index_shard_test.php
index 667019f8e..c79650385 100644
--- a/tests/index_shard_test.php
+++ b/tests/index_shard_test.php
@@ -369,7 +369,14 @@ class IndexShardTest extends UnitTest
$this->assertEqual($c_data['CCCCCCCCFFFFFFFF']
[CrawlConstants::SUMMARY_OFFSET],
0, "Summary offset matches predicted second word");
+ $out_string = $this->test_objects['shard']->save(true);
+ $this->test_objects['shard2'] = IndexShard::load("shard.txt",
+ $out_string);
+ $this->test_objects['shard']->prefixes = NULL;
+ $this->test_objects['shard']->unpackWordDocs();
+ $this->test_objects['shard']->packWordDocs(null, true);
+ $this->test_objects['shard']->prefixes = NULL;
}
/**
@@ -445,6 +452,7 @@ class IndexShardTest extends UnitTest
crawlHash('FFFFFFFF', true), 5);
$this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
"String Load Doc lookup 2 by word works");
+
}
}
?>