diff --git a/bin/fetcher.php b/bin/fetcher.php
index deea0c19e..30f0b2bfb 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -466,13 +466,10 @@ class Fetcher implements CrawlConstants
$this->to_crawl = array();
while($tok !== false) {
$string = base64_decode($tok);
- $tmp = unpack("f", substr($string, 0 , 4));
- $weight = $tmp[1];
- $tmp = unpack("N", substr($string, 4 , 4));
- $delay = $tmp[1];
+            $weight = unpackFloat(substr($string, 0, 4));
+            $delay = unpackInt(substr($string, 4, 4));
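+            // record layout: 4 bytes of packed float (weight), 4 bytes of
+            // big-endian packed int (delay), then the url; one base64
+            // encoded record per line (written out by bin/queue_server.php)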
$url = substr($string, 8);
$this->to_crawl[] = array($url, $weight, $delay);
-
$tok = strtok("\n");
}
}
@@ -1083,7 +1080,7 @@ class Fetcher implements CrawlConstants
$site = $this->found_sites[self::SEEN_URLS][$i];
if(!isset($site[self::HASH])) {continue; }
$doc_keys = crawlHash($site[self::URL], true) .
- $site[self::HASH];
+                    $site[self::HASH] . crawlHash("link:".$site[self::URL], true);
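+            /* doc_keys now carries three 8-byte hashes: the url hash, the
+               content hash, and a hash of "link:".url, under which
+               GroupIterator can look up postings for links pointing at
+               this document */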
$word_counts = array();
$phrase_string =
mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE] .
@@ -1153,8 +1150,8 @@ class Fetcher implements CrawlConstants
$link_text = strip_tags($link_text);
$link_id =
"url|".$url."|text|$link_text|ref|".$site[self::URL];
- $link_keys = crawlHash($link_id, true) .
- crawlHash($url, true) .
+ $link_keys = crawlHash($url, true) .
+ crawlHash($link_id, true) .
crawlHash("info:".$url, "true");
$summary[self::URL] = $link_id;
$summary[self::TITLE] = $url;
@@ -1162,10 +1159,7 @@ class Fetcher implements CrawlConstants
$summary[self::DESCRIPTION] = $link_text;
$summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
$summary[self::ENCODING] = $site[self::ENCODING];
- $summary[self::HASH] = false; /*
- link id's will always be unique so no sense
- deduplicating them
- */
+ $summary[self::HASH] = $link_id;
$summary[self::TYPE] = "link";
$summary[self::HTTP_CODE] = "link";
$this->found_sites[self::SEEN_URLS][] = $summary;
@@ -1183,7 +1177,7 @@ class Fetcher implements CrawlConstants
}
$index_shard->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
- $word_counts, $meta_ids);
+ $word_counts, $meta_ids, true);
$index_shard->appendIndexShard($link_shard);
diff --git a/bin/queue_server.php b/bin/queue_server.php
index d17944161..d9aaf65cf 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -38,7 +38,7 @@ define("BASE_DIR", substr(
dirname(realpath($_SERVER['PHP_SELF'])), 0,
-strlen("/bin")));
-ini_set("memory_limit","950M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","1100M"); //so have enough memory to crawl big pages
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
@@ -595,16 +595,18 @@ class QueueServer implements CrawlConstants
if(strcmp("url", $link_url_parts[0]) == 0 &&
strcmp("text", $link_url_parts[2]) == 0) {
$seen_sites[$i][self::HASH_URL] =
- crawlHash($seen_sites[$i][self::URL], true)
- . crawlHash($link_url_parts[1],true)
+ crawlHash($link_url_parts[1],true)
+ . crawlHash($seen_sites[$i][self::URL], true)
. crawlHash("info:".$link_url_parts[1], true);
+ $seen_sites[$i][self::IS_DOC] = false;
} else {
+ $seen_sites[$i][self::IS_DOC] = true;
$visited_urls_count++;
}
}
if(isset($sites[self::INVERTED_INDEX])) {
- $index_shard = $sites[self::INVERTED_INDEX];
+ $index_shard = & $sites[self::INVERTED_INDEX];
if($index_shard->word_docs_packed) {
$index_shard->unpackWordDocs();
}
@@ -618,8 +620,10 @@ class QueueServer implements CrawlConstants
$visited_urls_count);
foreach($seen_sites as $site) {
- if($site[self::HASH] !== false){ // so not link
- $hash = $site[self::HASH_URL].$site[self::HASH];
+                if($site[self::IS_DOC]) { // an actual document, not a link
+ $hash = $site[self::HASH_URL].
+ $site[self::HASH] .
+ crawlHash("link:".$site[self::URL], true);
} else {
$hash = $site[self::HASH_URL];
}
@@ -1153,7 +1157,7 @@ class QueueServer implements CrawlConstants
foreach($sites[self::SITES] as $site) {
list($url, $weight, $delay) = $site;
$out_string = base64_encode(
- pack("f", $weight).pack("N", $delay).$url)."\n";
+ packFloat($weight).packInt($delay).$url)."\n";
fwrite($fh, $out_string);
}
fclose($fh);
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 61a462f7c..95cba7659 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -136,7 +136,11 @@ interface CrawlConstants
const META_WORDS ='ao';
const CACHE_PAGE_PARTITION = 'ap';
const GENERATION = 'aq';
- const HASH_URL_COUNT = 'ar';
+ const HASH_SUM_SCORE = 'ar';
+ const HASH_URL_COUNT = 'as';
+ const IS_DOC = 'at';
+ const BOOST = 'av';
+
const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index e3cec25f8..d2dc756af 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -154,13 +154,52 @@ class GroupIterator extends IndexBundleIterator
* @return mixed doc ids and score if there are docs left, -1 otherwise
*/
function findDocsWithWord()
+ {
+ // first get a block of documents on which grouping can be done
+ $pages = $this->getPagesToGroup();
+ $this->count_block_unfiltered = count($pages);
+ if(!is_array($pages)) {
+ return $pages;
+ }
+
+ $this->current_block_hashes = array();
+
+ $this->current_seen_hashes = array();
+ if($this->count_block_unfiltered > 0 ) {
+        /* next we group like documents by url and remember which urls
+           we've seen in this block
+        */
+ $pre_out_pages = $this->groupByHashUrl($pages);
+
+        /* get the doc page for each group of link data, if one exists and
+           we don't already have it; also aggregate the groups by hash
+        */
+ $this->groupByHashAndAggregate($pre_out_pages);
+ $this->count_block = count($pre_out_pages);
+
+ /*
+            Build a single out page with a boosted aggregate score for
+            each group we found
+ */
+ $pages = $this->computeBoostAndOutPages($pre_out_pages);
+ }
+ $this->pages = $pages;
+        return $pages;
+    }
+
+ /**
+ * Gets a sample of a few hundred pages on which to do grouping by URL
+ *
+ * @return array of pages of document key --> meta data arrays
+ */
+ function getPagesToGroup()
{
$pages = array();
$count = 0;
$done = false;
- // first get a block of documents on which grouping can be done
do {
$new_pages = $this->index_bundle_iterator->currentDocsWithWord();
+
if(!is_array($new_pages)) {
$done = true;
if(count($pages) == 0) {
@@ -176,167 +215,234 @@ class GroupIterator extends IndexBundleIterator
$done = true;
}
} while(!$done);
- $this->count_block_unfiltered = count($pages);
- if(!is_array($pages)) {
- return $pages;
- }
+ return $pages;
+ }
- /* next we group like documents by url and remember which urls we've
- seen this block
- */
- $this->current_block_hashes = array();
+ /**
+     * Groups documents, as well as mini-pages built from links to
+     * documents, by url to produce an array of arrays of documents with
+     * the same url. Since
+ * this is called in an iterator, documents which were already returned by
+ * a previous call to currentDocsWithWord() followed by an advance() will
+ * have been remembered in grouped_keys and will be ignored in the return
+ * result of this function.
+ *
+ * @param array &$pages pages to group
+ * @return array $pre_out_pages pages after grouping
+ */
+ function groupByHashUrl(&$pages)
+ {
$pre_out_pages = array();
- $this->current_seen_hashes = array();
- if($this->count_block_unfiltered > 0 ) {
- $i = $this->seen_docs;
- foreach($pages as $doc_key => $doc_info) {
- if(!is_array($doc_info) || $doc_info[self::SUMMARY_OFFSET] ==
- self::NEEDS_OFFSET_FLAG) {continue;}
- $doc_info['KEY'] = $doc_key;
- if(strlen($doc_key) == 16) {
- $hash_url = substr($doc_key, 0, 8);
- $doc_info[self::HASH] = substr($doc_key, 8);
- if(!isset($pre_out_pages[$hash_url])) {
- $pre_out_pages[$hash_url] = array();
- }
- array_unshift($pre_out_pages[$hash_url], $doc_info);
- } else {
- $doc_key_parts = array(
- substr($doc_key, 0, 8),substr($doc_key, 8, 8),
- substr($doc_key, 16, 8)
- );
- $hash_url = $doc_key_parts[1];
- $doc_info[self::HASH] = false;
- $pre_out_pages[$hash_url][] = $doc_info;
- }
-
- if($doc_info[self::HASH] !== false) {
- $pre_out_pages[$hash_url]['IS_PAGE'] = true;
- } else {
- $pre_out_pages[$hash_url]['HASH_INFO_URL'] =
- $doc_key_parts[2];
+ foreach($pages as $doc_key => $doc_info) {
+ if(!is_array($doc_info) || $doc_info[self::SUMMARY_OFFSET] ==
+ self::NEEDS_OFFSET_FLAG) {continue;}
+ $doc_info['KEY'] = $doc_key;
+ $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN);
+ $doc_info[self::HASH] = substr($doc_key,
+ IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+ $doc_info[self::INLINKS] = substr($doc_key,
+ 2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+ if($doc_info[self::IS_DOC]) {
+ if(!isset($pre_out_pages[$hash_url])) {
+ $pre_out_pages[$hash_url] = array();
}
+ array_unshift($pre_out_pages[$hash_url], $doc_info);
+ } else {
+ $pre_out_pages[$hash_url][] = $doc_info;
+ }
- if(!isset($this->grouped_keys[$hash_url])) {
- /*
- new urls found in this block
- */
- $this->current_block_hashes[] = $hash_url;
- } else {
- unset($pre_out_pages[$hash_url]);
- }
+ if(!isset($this->grouped_keys[$hash_url])) {
+ /*
+ new urls found in this block
+ */
+ $this->current_block_hashes[] = $hash_url;
+ } else {
+ unset($pre_out_pages[$hash_url]);
}
+ }
- /*get summary page for groups of link data if exists and don't have
- also aggregate by hash
- */
- $this->current_seen_hashes = array();
- foreach($pre_out_pages as $hash_url => $data) {
- if(!isset($pre_out_pages[$hash_url]['IS_PAGE'])) {
- $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL'];
- $word_iterator =
- new WordIterator($hash_info_url,
- $this->getIndex(), true);
- $doc_array = $word_iterator->currentDocsWithWord();
-
- if(is_array($doc_array) && count($doc_array) == 1) {
- $relevance = $this->computeRelevance(
- $word_iterator->current_generation,
- $word_iterator->current_offset);
- $keys = array_keys($doc_array);
- $key = $keys[0];
- $item = $doc_array[$key];
- $item[self::RELEVANCE] += $relevance;
- $item[self::SCORE] += $relevance;
- $item['KEY'] = substr($key, 0, 8);
- $item[self::HASH] = substr($key, 8, 8);
- array_unshift($pre_out_pages[$hash_url], $item);
- }
- } else {
- unset($pre_out_pages[$hash_url]['IS_PAGE']);
- }
- if(isset($pre_out_pages[$hash_url]['HASH_INFO_URL'])) {
- unset($pre_out_pages[$hash_url]['HASH_INFO_URL']);
+ return $pre_out_pages;
+ }
+
+ /**
+ * For documents which had been previously grouped by the hash of their
+     * url, groups these groups further by the hash of their page contents.
+     * For each collection of groups with the same summary hash, this
+     * function then selects the group with the highest aggregate score as
+     * its representative. The function then modifies the
+ * supplied argument array to make it an array of group representatives.
+ *
+ * @param array &$pre_out_pages documents previously grouped by hash of url
+ */
+ function groupByHashAndAggregate(&$pre_out_pages)
+ {
+ foreach($pre_out_pages as $hash_url => $data) {
+ if(!$pre_out_pages[$hash_url][0][self::IS_DOC]) {
+                $hash_info_url =
+                    $pre_out_pages[$hash_url][0][self::INLINKS];
+ $word_iterator =
+ new WordIterator($hash_info_url,
+ $this->getIndex(), true);
+ $doc_array = $word_iterator->currentDocsWithWord();
+
+ if(is_array($doc_array) && count($doc_array) == 1) {
+ $relevance = $this->computeRelevance(
+ $word_iterator->current_generation,
+ $word_iterator->current_offset);
+ $keys = array_keys($doc_array);
+ $key = $keys[0];
+ $item = $doc_array[$key];
+ $item[self::RELEVANCE] = $relevance;
+ $item[self::SCORE] += $relevance;
+ $item['KEY'] = $key;
+ $item[self::HASH] = substr($key,
+ IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+ $item[self::INLINKS] = substr($key,
+ 2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+ array_unshift($pre_out_pages[$hash_url], $item);
}
- if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
- $hash = $pre_out_pages[$hash_url][0][self::HASH];
- if(isset($this->grouped_hashes[$hash])) {
+ }
+
+ $this->aggregateScores($pre_out_pages[$hash_url]);
+
+ if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
+ $hash = $pre_out_pages[$hash_url][0][self::HASH];
+ if(isset($this->grouped_hashes[$hash])) {
+ unset($pre_out_pages[$hash_url]);
+ } else if(isset($this->current_seen_hashes[$hash])) {
+ $previous_url = $this->current_seen_hashes[$hash];
+ if($pre_out_pages[$previous_url][0][
+ self::HASH_SUM_SCORE] >=
+ $pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]) {
unset($pre_out_pages[$hash_url]);
- } else if(isset($this->current_seen_hashes[$hash])) {
- $previous_url = $this->current_seen_hashes[$hash];
- if($pre_out_pages[$previous_url][0][
- self::HASH_URL_COUNT] >=
- count($pre_out_pages[$hash_url])) {
- unset($pre_out_pages[$hash_url]);
- } else {
- $this->current_seen_hashes[$hash] = $hash_url;
- $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
- count($pre_out_pages[$hash_url]);
- unset($pre_out_pages[$previous_url]);
- }
} else {
- $i++;
$this->current_seen_hashes[$hash] = $hash_url;
- $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
- count($pre_out_pages[$hash_url]);
+ unset($pre_out_pages[$previous_url]);
}
+ } else {
+ $this->current_seen_hashes[$hash] = $hash_url;
}
}
- $this->count_block = count($pre_out_pages);
+ }
- /*
- Calculate grouped values for each field of the groups we found
- */
- $out_pages = array();
- foreach($pre_out_pages as $hash_url => $group_infos) {
- foreach($group_infos as $doc_info) {
- $is_page = ($doc_info[self::HASH] !== false) ? true : false;
- if(!isset($out_pages[$hash_url]) || $is_page) {
- if(isset($out_pages[$hash_url]) && $is_page) {
- $rank = $out_pages[$hash_url][self::DOC_RANK];
- $relevance = $out_pages[$hash_url][self::RELEVANCE];
- $out_pages[$hash_url] = $doc_info;
- $out_pages[$hash_url][self::DOC_RANK] += $rank;
- $out_pages[$hash_url][self::RELEVANCE] +=
- $relevance;
- $out_pages[$hash_url][self::SCORE] +=
- $rank + $relevance;
- } else {
- $out_pages[$hash_url] = $doc_info;
- }
- $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
- if(isset($doc_info[self::SUMMARY_OFFSET]) &&
- isset($doc_info[self::GENERATION])) {
- $out_pages[$hash_url][self::SUMMARY_OFFSET] =
- array(array($doc_info["KEY"],
- $doc_info[self::GENERATION],
- $doc_info[self::SUMMARY_OFFSET]));
- unset($out_pages[$hash_url]["KEY"]);
- }
- } else {
- $fields = array_keys($out_pages[$hash_url]);
- foreach($fields as $field) {
- if(isset($doc_info[$field]) &&
- $field != self::SUMMARY_OFFSET &&
- $field != self::GENERATION) {
- $out_pages[$hash_url][$field] +=
- $doc_info[$field];
- } else if($field == self::SUMMARY_OFFSET) {
- $out_pages[$hash_url][$field][] =
- array($doc_info["KEY"],
- $doc_info[self::GENERATION],
- $doc_info[$field]);
- }
- }
- }
- }
+ }
+
+ /**
+     * For a collection of grouped pages, generates a summary for each
+     * group and returns an array of out pages consisting
+     * of a single summarized document per group. These single summarized
+ * documents have aggregated scores to which a "boost" has been added.
+ * For a single summarized page, its boost is an estimate of the score of
+ * the pages that would have been grouped with it had more pages from the
+ * underlying iterator been examined. This is calculated by looking at
+     * the total number of inlinks to the page, estimating how many of these
+ * would have been returned by the current iterator, then assuming the
+ * scores of these pages follow a Zipfian distribution and computing the
+ * appropriate integral.
+ *
+ * @param array &$pre_out_pages array of groups of pages for which out pages
+ * are to be generated.
+ * @return array $out_pages array of single summarized documents to which a
+ * "boost" has been applied
+ */
+ function computeBoostAndOutPages(&$pre_out_pages)
+ {
+ $out_pages = array();
+ foreach($pre_out_pages as $hash_url => $group_infos) {
+ $out_pages[$hash_url] = $pre_out_pages[$hash_url][0];
+ $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
+ unset($out_pages[$hash_url][self::GENERATION]);
+ for($i = 0; $i <
+                $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT]; $i++) {
+ $doc_info = $group_infos[$i];
+ $out_pages[$hash_url][self::SUMMARY_OFFSET][] =
+ array($doc_info["KEY"], $doc_info[self::GENERATION],
+ $doc_info[self::SUMMARY_OFFSET]);
}
- $pages = $out_pages;
+
+            if($this->count_block_unfiltered >= $this->results_per_block) {
+ /* approximate the scores contributed to this
+ doc for this word search by links we haven't
+ reached in our grouping
+ */
+ $word_iterator = new WordIterator(
+ $out_pages[$hash_url][self::INLINKS],
+ $this->getIndex(), true);
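+                // the + 0.1 below keeps log($num_inlinks) finite and nonzero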
+ $num_inlinks = $word_iterator->num_docs + 0.1;
+ $num_docs_seen = $this->seen_docs_unfiltered +
+ $this->count_block_unfiltered;
+
+ $hash_count = $out_pages[$hash_url][self::HASH_URL_COUNT];
+
+ /*
+ An attempt to approximate the total number of inlinks
+ to a document which will have the terms in question.
+ */
+ $total_inlinks_for_doc = min($num_inlinks,
+ $hash_count * $this->num_docs/$num_docs_seen);
+
+ /*
+                    the score of the xth inlink for this document
+                    is approximately score[x] = score[1] * x^{-alpha}.
+ If n = $total_inlinks_for_doc, then by integrating this
+ from k = self::HASH_URL_COUNT to n, we get an
+ approximation for the score we haven't seen (which
+ we call the boost).
+ boost = score[1](n^{1-alpha} - k^{1-alpha})/(1-alpha)
+ let exponent = 1-alpha
+ */
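+                /* worked example with made-up numbers: if the max and min
+                   scores seen are 8 and 2 and num_inlinks is about 16, then
+                   alpha = (log 8 - log 2)/log 16 = 0.5, so
+                   exponent = 1 - alpha = 0.5; with hash_count k = 4 and
+                   total_inlinks_for_doc n = 16,
+                   boost = 8*(16^0.5 - 4^0.5)/0.5 = 8*(4 - 2)/0.5 = 32
+                */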
+ $max_score = $out_pages[$hash_url][self::MAX];
+ $exponent = 1 - ((log($max_score) -
+ log($out_pages[$hash_url][self::MIN]))/ log($num_inlinks));
+ $boost = $max_score*(pow($total_inlinks_for_doc, $exponent) -
+ pow($hash_count, $exponent))/$exponent;
+ /*
+                    although relevance is a log-based quantity, we want to
+                    further penalize docs with a high rank but low relevance
+                    for the underlying iterator, so we use a weighted
+                    higher-order average
+ */
+ $out_pages[$hash_url][self::SCORE] =
+ ($out_pages[$hash_url][self::HASH_SUM_SCORE] + $boost) *
+ (1 + $out_pages[$hash_url][self::RELEVANCE])/2;
+ } else {
+ $out_pages[$hash_url][self::SCORE] =
+ $out_pages[$hash_url][self::HASH_SUM_SCORE] *
+ (1 + $out_pages[$hash_url][self::RELEVANCE])/2;
+ }
+
}
- $this->pages = $pages;
- return $pages;
+ return $out_pages;
+ }
+ /**
+     * For a collection of pages, each with the same url, computes the min
+     * score, the max score, the sum of the scores, the sum of the ranks,
+     * the sum of the relevance scores, and the count. Stores this
+     * information in the first element of the array of pages.
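+     *
+     * @param array &$pre_hash_page pages, all sharing the same url hash,
+     *      whose scores are to be aggregated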
+ */
+ function aggregateScores(&$pre_hash_page)
+ {
+ $sum_score = 0;
+ $sum_rank = 0;
+ $sum_relevance = 0;
+ $min = 1000000; //no score will be this big
+ $max = -1;
+ foreach($pre_hash_page as $hash_page) {
+ if(isset($hash_page[self::SCORE])) {
+ $current_score = $hash_page[self::SCORE];
+ $min = ($current_score < $min ) ? $current_score : $min;
+ $max = ($max < $current_score ) ? $current_score : $max;
+ $sum_score += $current_score;
+ $sum_rank += $hash_page[self::DOC_RANK];
+ $sum_relevance += $hash_page[self::RELEVANCE];
+ }
+ }
+ $pre_hash_page[0][self::MIN] = $min;
+ $pre_hash_page[0][self::MAX] = $max;
+ $pre_hash_page[0][self::HASH_SUM_SCORE] = $sum_score;
+ $pre_hash_page[0][self::DOC_RANK] = $sum_rank;
+ $pre_hash_page[0][self::RELEVANCE] = $sum_relevance;
+ $pre_hash_page[0][self::HASH_URL_COUNT] = count($pre_hash_page);
}
/**
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 632b645c5..1b31bac59 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -227,7 +227,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* Used to keep track of whether a record in document infos is for a
* document or for a link
*/
- const COMPOSITE_ID_FLAG = 0x80000000;
+ const LINK_FLAG = 0x800000;
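+    // the flag now sits in the packed doc-length field of a posting (the
+    // unpack code below masks it back out of $doc_len), which presumably
+    // caps genuine doc lengths at under 2^23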
/**
* Size in bytes of one block in IndexShard
@@ -315,12 +315,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* @return bool success or failure of performing the add
*/
function addDocumentWords($doc_keys, $summary_offset, $word_counts,
- $meta_ids)
+ $meta_ids, $is_doc = false)
{
if($this->word_docs_packed == true) {
$this->unpackWordDocs();
}
- $is_doc = false;
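+        // $is_doc is now supplied by the caller (bin/fetcher.php passes
+        // true for actual documents) instead of inferred from the key count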
+
$doc_len = 0;
$link_doc_len = 0;
$len_key = strlen($doc_keys);
@@ -336,12 +336,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$added_len = strlen($summary_offset_string);
$this->doc_infos .= $summary_offset_string;
- if($num_keys <= 2) {
+ if($is_doc) {
$this->num_docs++;
- $is_doc = true;
} else { //link item
$this->num_link_docs++;
- $is_doc = false;
}
foreach($meta_ids as $meta_id) {
$word_counts[$meta_id] = 0;
@@ -369,7 +367,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$this->len_all_docs += $doc_len;
$this->len_all_link_docs += $link_doc_len;
$len_num_keys = ($is_doc) ? $this->packPosting($doc_len, $num_keys) :
- $this->packPosting($link_doc_len, $num_keys);
+ $this->packPosting((self::LINK_FLAG | $link_doc_len), $num_keys);
$this->doc_infos .= $len_num_keys;
$added_len += strlen($len_num_keys);
$this->doc_infos .= $doc_keys;
@@ -494,7 +492,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
list($doc_len, $num_keys) =
$this->unpackPosting(substr($doc_info_string, 4));
$item[self::GENERATION] = $this->generation;
- $is_doc = ($num_keys <= 2) ? true :false;
+ $is_doc = (($doc_len & self::LINK_FLAG) == 0) ? true : false;
+ if(!$is_doc) {$doc_len -= self::LINK_FLAG; }
+ $item[self::IS_DOC] = $is_doc;
$skip_stats = false;
if($item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) {
@@ -760,7 +760,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$num_words++;
} else {
if($old_prefix !== false) {
- $tmp[$old_prefix] = pack("N", $offset) .
+ $tmp[$old_prefix] = packInt($offset) .
pack("N", $num_words);
$offset += $num_words * $word_item_len;
}
@@ -768,7 +768,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$num_words = 1;
}
}
- $tmp[$old_prefix] = pack("N", $offset) . pack("N", $num_words);
+ $tmp[$old_prefix] = packInt($offset) . packInt($num_words);
$num_prefixes = 2 << 16;
$this->prefixes = "";
for($i = 0; $i < $num_prefixes; $i++) {
diff --git a/lib/utility.php b/lib/utility.php
index e885dec49..f5bddaeae 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -111,6 +111,29 @@ function packInt($my_int)
return pack("N", $my_int);
}
+/**
+ * Unpacks a float from a 4 char string
+ *
+ * @param string $str the 4 char string to extract the float from
+ * @return float extracted float
+ */
+function unpackFloat($str)
+{
+ $tmp = unpack("f", $str);
+ return $tmp[1];
+}
+
+/**
+ * Packs a float into a 4 char string
+ *
+ * @param float $my_float the float to pack
+ * @return string the packed string
+ */
+function packFloat($my_float)
+{
+ return pack("f", $my_float);
+}
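+
+/* a quick sanity check for the two helpers above; note pack("f") uses the
+   machine's own float size and byte order, so packed values are only
+   portable between machines with the same representation:
+   assert(strlen(packFloat(3.5)) == 4);
+   assert(unpackFloat(packFloat(3.5)) == 3.5);
+*/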
+
/**
* Converts a string to string where each char has been replaced by its
* hexadecimal equivalent