Refactors and further improves group iterator and index shard, bumps the default ini_set for queue_server memory, a=chris

Chris Pollett [2011-01-05]
Refactors and further improves group iterator and index shard, bumps the default ini_set for queue_server memory, a=chris
Filename
bin/fetcher.php
bin/queue_server.php
lib/crawl_constants.php
lib/index_bundle_iterators/group_iterator.php
lib/index_shard.php
lib/utility.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index deea0c19e..30f0b2bfb 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -466,13 +466,10 @@ class Fetcher implements CrawlConstants
             $this->to_crawl = array();
             while($tok !== false) {
                 $string = base64_decode($tok);
-                $tmp = unpack("f", substr($string, 0 , 4));
-                $weight = $tmp[1];
-                $tmp = unpack("N", substr($string, 4 , 4));
-                $delay = $tmp[1];
+                $weight = unpackFloat(substr($string, 0 , 4));
+                $delay = unpackInt(substr($string, 4 , 4));
                 $url = substr($string, 8);
                 $this->to_crawl[] = array($url, $weight, $delay);
-
                 $tok = strtok("\n");
             }
         }
@@ -1083,7 +1080,7 @@ class Fetcher implements CrawlConstants
             $site = $this->found_sites[self::SEEN_URLS][$i];
             if(!isset($site[self::HASH])) {continue; }
             $doc_keys = crawlHash($site[self::URL], true) .
-                $site[self::HASH];
+                $site[self::HASH]. crawlHash("link:".$site[self::URL], true);
             $word_counts = array();
             $phrase_string =
                 mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE] .
@@ -1153,8 +1150,8 @@ class Fetcher implements CrawlConstants
                     $link_text = strip_tags($link_text);
                     $link_id =
                         "url|".$url."|text|$link_text|ref|".$site[self::URL];
-                    $link_keys =  crawlHash($link_id, true) .
-                        crawlHash($url, true) .
+                    $link_keys = crawlHash($url, true) .
+                        crawlHash($link_id, true) .
                         crawlHash("info:".$url, "true");
                     $summary[self::URL] =  $link_id;
                     $summary[self::TITLE] = $url;
@@ -1162,10 +1159,7 @@ class Fetcher implements CrawlConstants
                     $summary[self::DESCRIPTION] =  $link_text;
                     $summary[self::TIMESTAMP] =  $site[self::TIMESTAMP];
                     $summary[self::ENCODING] = $site[self::ENCODING];
-                    $summary[self::HASH] =  false; /*
-                        link id's will always be unique so no sense
-                        deduplicating them
-                    */
+                    $summary[self::HASH] =  $link_id;
                     $summary[self::TYPE] = "link";
                     $summary[self::HTTP_CODE] = "link";
                     $this->found_sites[self::SEEN_URLS][] = $summary;
@@ -1183,7 +1177,7 @@ class Fetcher implements CrawlConstants

             }
             $index_shard->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
-                $word_counts, $meta_ids);
+                $word_counts, $meta_ids, true);

             $index_shard->appendIndexShard($link_shard);
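Note on the fetcher.php hunks above: they change the record keys the fetcher emits. A summarized document's key now ends with a hash of "link:" prepended to its url (used later to look up the document's inlinks), and a link mini-page's key now leads with the hash of the target url so that it lands in the same group as the document itself. A minimal sketch of the two layouts follows; it assumes crawlHash($x, true) returns an 8-byte raw hash and that IndexShard::DOC_KEY_LEN is 8, as the substr() offsets replaced in group_iterator.php below suggest. The url, link text, and include path are made-up examples.

<?php
// Sketch only, not part of the commit; values and include path are examples.
require_once "lib/utility.php"; // for crawlHash()

$url = "http://www.example.com/";
$link_text = "Example";
$content_hash = crawlHash("the page contents", true); // stands in for $site[self::HASH]
$link_id = "url|".$url."|text|$link_text|ref|http://referrer.example.org/";

// Document record: hash(url) . hash(contents) . hash("link:url")
$doc_keys = crawlHash($url, true) . $content_hash .
    crawlHash("link:".$url, true);

// Link mini-page record: hash(target url) first, so it groups with the doc
$link_keys = crawlHash($url, true) . crawlHash($link_id, true) .
    crawlHash("info:".$url, true);

echo strlen($doc_keys) . "\n"; // 24 if crawlHash(..., true) yields 8 raw bytes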

diff --git a/bin/queue_server.php b/bin/queue_server.php
index d17944161..d9aaf65cf 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -38,7 +38,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","950M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","1100M"); //so have enough memory to crawl big pages

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
@@ -595,16 +595,18 @@ class QueueServer implements CrawlConstants
             if(strcmp("url", $link_url_parts[0]) == 0 &&
                 strcmp("text", $link_url_parts[2]) == 0) {
                 $seen_sites[$i][self::HASH_URL] =
-                    crawlHash($seen_sites[$i][self::URL], true)
-                    . crawlHash($link_url_parts[1],true)
+                    crawlHash($link_url_parts[1],true)
+                    . crawlHash($seen_sites[$i][self::URL], true)
                     . crawlHash("info:".$link_url_parts[1], true);
+                $seen_sites[$i][self::IS_DOC] = false;
             } else {
+                $seen_sites[$i][self::IS_DOC] = true;
                 $visited_urls_count++;
             }
         }

         if(isset($sites[self::INVERTED_INDEX])) {
-            $index_shard =  $sites[self::INVERTED_INDEX];
+            $index_shard =  & $sites[self::INVERTED_INDEX];
             if($index_shard->word_docs_packed) {
                 $index_shard->unpackWordDocs();
             }
@@ -618,8 +620,10 @@ class QueueServer implements CrawlConstants
                     $visited_urls_count);

                 foreach($seen_sites as $site) {
-                    if($site[self::HASH] !== false){ // so not link
-                        $hash = $site[self::HASH_URL].$site[self::HASH];
+                    if($site[self::IS_DOC]){ // so not link
+                        $hash = $site[self::HASH_URL].
+                            $site[self::HASH] .
+                            crawlHash("link:".$site[self::URL], true);
                     } else {
                         $hash = $site[self::HASH_URL];
                     }
@@ -1153,7 +1157,7 @@ class QueueServer implements CrawlConstants
             foreach($sites[self::SITES] as $site) {
                 list($url, $weight, $delay) = $site;
                 $out_string = base64_encode(
-                    pack("f", $weight).pack("N", $delay).$url)."\n";
+                    packFloat($weight).packInt($delay).$url)."\n";
                 fwrite($fh, $out_string);
             }
             fclose($fh);
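The schedule lines written here are read back by the fetcher.php hunk at the top of this commit, so both sides have to agree on the layout: a 4-byte packed float weight, a 4-byte packed int delay, then the url, base64 encoded and newline terminated. A rough round-trip sketch using the new packFloat/unpackFloat helpers and the existing packInt/unpackInt from lib/utility.php; the url, weight, delay, and include path are made up, and note that pack("f") stores single precision in machine byte order.

<?php
// Sketch only, not part of the commit; include path and values are examples.
require_once "lib/utility.php";

$url = "http://www.example.com/robots.txt";
$weight = 3.5;
$delay = 10;

// queue_server.php side: encode one schedule line
$out_string = base64_encode(packFloat($weight).packInt($delay).$url)."\n";

// fetcher.php side: decode it again
$string = base64_decode(trim($out_string));
$got_weight = unpackFloat(substr($string, 0, 4)); // 3.5
$got_delay = unpackInt(substr($string, 4, 4));    // 10
$got_url = substr($string, 8);                    // http://www.example.com/robots.txt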
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 61a462f7c..95cba7659 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -136,7 +136,11 @@ interface CrawlConstants
     const META_WORDS ='ao';
     const CACHE_PAGE_PARTITION = 'ap';
     const GENERATION = 'aq';
-    const HASH_URL_COUNT = 'ar';
+    const HASH_SUM_SCORE = 'ar';
+    const HASH_URL_COUNT = 'as';
+    const IS_DOC = 'at';
+    const BOOST = 'av';
+

     const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;

diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index e3cec25f8..d2dc756af 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -154,13 +154,52 @@ class GroupIterator extends IndexBundleIterator
      * @return mixed doc ids and score if there are docs left, -1 otherwise
      */
     function findDocsWithWord()
+    {
+        // first get a block of documents on which grouping can be done
+        $pages =  $this->getPagesToGroup();
+        $this->count_block_unfiltered = count($pages);
+        if(!is_array($pages)) {
+            return $pages;
+        }
+
+        $this->current_block_hashes = array();
+
+        $this->current_seen_hashes = array();
+        if($this->count_block_unfiltered > 0 ) {
+            /* next we group like documents by url and remember which urls we've
+               seen this block
+            */
+            $pre_out_pages = $this->groupByHashUrl($pages);
+
+            /* get the doc page for groups of link data if it exists and we
+               don't already have it; also aggregate by hash
+             */
+            $this->groupByHashAndAggregate($pre_out_pages);
+            $this->count_block = count($pre_out_pages);
+
+            /*
+                Calculate aggregate values for each field of the groups we found
+             */
+            $pages = $this->computeBoostAndOutPages($pre_out_pages);
+        }
+        $this->pages = $pages;
+        return $pages;
+
+    }
+
+    /**
+     * Gets a sample of a few hundred pages on which to do grouping by URL
+     *
+     * @return array of pages of document key --> meta data arrays
+     */
+    function getPagesToGroup()
     {
         $pages = array();
         $count = 0;
         $done = false;
-        // first get a block of documents on which grouping can be done
         do {
             $new_pages = $this->index_bundle_iterator->currentDocsWithWord();
+
             if(!is_array($new_pages)) {
                 $done = true;
                 if(count($pages) == 0) {
@@ -176,167 +215,234 @@ class GroupIterator extends IndexBundleIterator
                 $done = true;
             }
         } while(!$done);
-        $this->count_block_unfiltered = count($pages);
-        if(!is_array($pages)) {
-            return $pages;
-        }
+        return $pages;
+    }

-        /* next we group like documents by url and remember which urls we've
-           seen this block
-        */
-        $this->current_block_hashes = array();
+    /**
+     * Groups documents, as well as the link mini-pages that point to them,
+     * by url to produce an array of arrays of documents with the same url.
+     * Since this is called in an iterator, documents which were already
+     * returned by a previous call to currentDocsWithWord() followed by an
+     * advance() will have been remembered in grouped_keys and will be
+     * ignored in the return result of this function.
+     *
+     * @param array &$pages pages to group
+     * @return array $pre_out_pages pages after grouping
+     */
+    function groupByHashUrl(&$pages)
+    {
         $pre_out_pages = array();
-        $this->current_seen_hashes = array();
-        if($this->count_block_unfiltered > 0 ) {
-            $i = $this->seen_docs;
-            foreach($pages as $doc_key => $doc_info) {
-                if(!is_array($doc_info) || $doc_info[self::SUMMARY_OFFSET] ==
-                    self::NEEDS_OFFSET_FLAG) {continue;}
-                $doc_info['KEY'] = $doc_key;
-                if(strlen($doc_key) == 16) {
-                    $hash_url = substr($doc_key, 0, 8);
-                    $doc_info[self::HASH] = substr($doc_key, 8);
-                    if(!isset($pre_out_pages[$hash_url])) {
-                        $pre_out_pages[$hash_url] = array();
-                    }
-                    array_unshift($pre_out_pages[$hash_url], $doc_info);
-                } else {
-                    $doc_key_parts = array(
-                        substr($doc_key, 0, 8),substr($doc_key, 8, 8),
-                        substr($doc_key, 16, 8)
-                    );
-                    $hash_url = $doc_key_parts[1];
-                    $doc_info[self::HASH] = false;
-                    $pre_out_pages[$hash_url][] = $doc_info;
-                }
-
-                if($doc_info[self::HASH] !== false) {
-                    $pre_out_pages[$hash_url]['IS_PAGE'] = true;
-                } else {
-                    $pre_out_pages[$hash_url]['HASH_INFO_URL'] =
-                        $doc_key_parts[2];
+        foreach($pages as $doc_key => $doc_info) {
+            if(!is_array($doc_info) || $doc_info[self::SUMMARY_OFFSET] ==
+                self::NEEDS_OFFSET_FLAG) {continue;}
+            $doc_info['KEY'] = $doc_key;
+            $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN);
+            $doc_info[self::HASH] = substr($doc_key,
+                IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            $doc_info[self::INLINKS] = substr($doc_key,
+                2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            if($doc_info[self::IS_DOC]) {
+                if(!isset($pre_out_pages[$hash_url])) {
+                    $pre_out_pages[$hash_url] = array();
                 }
+                array_unshift($pre_out_pages[$hash_url], $doc_info);
+            } else {
+                $pre_out_pages[$hash_url][] = $doc_info;
+            }

-                if(!isset($this->grouped_keys[$hash_url])) {
-                    /*
-                        new urls found in this block
-                    */
-                    $this->current_block_hashes[] = $hash_url;
-                } else {
-                    unset($pre_out_pages[$hash_url]);
-                }
+            if(!isset($this->grouped_keys[$hash_url])) {
+               /*
+                    new urls found in this block
+                */
+                $this->current_block_hashes[] = $hash_url;
+            } else {
+                unset($pre_out_pages[$hash_url]);
             }
+        }

-             /*get summary page for groups of link data if exists and don't have
-               also aggregate by hash
-             */
-            $this->current_seen_hashes = array();
-            foreach($pre_out_pages as $hash_url => $data) {
-                if(!isset($pre_out_pages[$hash_url]['IS_PAGE'])) {
-                    $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL'];
-                    $word_iterator =
-                         new WordIterator($hash_info_url,
-                            $this->getIndex(), true);
-                    $doc_array = $word_iterator->currentDocsWithWord();
-
-                    if(is_array($doc_array) && count($doc_array) == 1) {
-                        $relevance =  $this->computeRelevance(
-                            $word_iterator->current_generation,
-                            $word_iterator->current_offset);
-                        $keys = array_keys($doc_array);
-                        $key = $keys[0];
-                        $item = $doc_array[$key];
-                        $item[self::RELEVANCE] += $relevance;
-                        $item[self::SCORE] += $relevance;
-                        $item['KEY'] = substr($key, 0, 8);
-                        $item[self::HASH] = substr($key, 8, 8);
-                        array_unshift($pre_out_pages[$hash_url], $item);
-                    }
-                } else {
-                        unset($pre_out_pages[$hash_url]['IS_PAGE']);
-                }
-                if(isset($pre_out_pages[$hash_url]['HASH_INFO_URL'])) {
-                    unset($pre_out_pages[$hash_url]['HASH_INFO_URL']);
+        return $pre_out_pages;
+    }
+
+    /**
+     * For documents which have previously been grouped by the hash of their
+     * url, groups these groups further by the hash of their page contents.
+     * For each collection of groups with the same summary hash, this function
+     * selects the group with the highest aggregate score as the
+     * representative of that collection. The function then modifies the
+     * supplied argument array to make it an array of group representatives.
+     *
+     * @param array &$pre_out_pages documents previously grouped by hash of url
+     */
+     function groupByHashAndAggregate(&$pre_out_pages)
+     {
+         foreach($pre_out_pages as $hash_url => $data) {
+            if(!$pre_out_pages[$hash_url][0][self::IS_DOC]) {
+                $hash_info_url=
+                    $pre_out_pages[$hash_url][0][self::INLINKS];
+                $word_iterator =
+                     new WordIterator($hash_info_url,
+                        $this->getIndex(), true);
+                $doc_array = $word_iterator->currentDocsWithWord();
+
+                if(is_array($doc_array) && count($doc_array) == 1) {
+                    $relevance =  $this->computeRelevance(
+                        $word_iterator->current_generation,
+                        $word_iterator->current_offset);
+                    $keys = array_keys($doc_array);
+                    $key = $keys[0];
+                    $item = $doc_array[$key];
+                    $item[self::RELEVANCE] = $relevance;
+                    $item[self::SCORE] += $relevance;
+                    $item['KEY'] = $key;
+                    $item[self::HASH] = substr($key,
+                        IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+                    $item[self::INLINKS] = substr($key,
+                        2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+                    array_unshift($pre_out_pages[$hash_url], $item);
                 }
-                if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
-                    $hash = $pre_out_pages[$hash_url][0][self::HASH];
-                    if(isset($this->grouped_hashes[$hash])) {
+            }
+
+            $this->aggregateScores($pre_out_pages[$hash_url]);
+
+            if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
+                $hash = $pre_out_pages[$hash_url][0][self::HASH];
+                if(isset($this->grouped_hashes[$hash])) {
+                    unset($pre_out_pages[$hash_url]);
+                } else if(isset($this->current_seen_hashes[$hash])) {
+                    $previous_url = $this->current_seen_hashes[$hash];
+                    if($pre_out_pages[$previous_url][0][
+                        self::HASH_SUM_SCORE] >=
+                        $pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]) {
                         unset($pre_out_pages[$hash_url]);
-                    } else if(isset($this->current_seen_hashes[$hash])) {
-                        $previous_url = $this->current_seen_hashes[$hash];
-                        if($pre_out_pages[$previous_url][0][
-                            self::HASH_URL_COUNT] >=
-                            count($pre_out_pages[$hash_url])) {
-                            unset($pre_out_pages[$hash_url]);
-                        } else {
-                            $this->current_seen_hashes[$hash] = $hash_url;
-                            $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
-                                count($pre_out_pages[$hash_url]);
-                            unset($pre_out_pages[$previous_url]);
-                        }
                     } else {
-                        $i++;
                         $this->current_seen_hashes[$hash] = $hash_url;
-                        $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
-                            count($pre_out_pages[$hash_url]);
+                        unset($pre_out_pages[$previous_url]);
                     }
+                } else {
+                    $this->current_seen_hashes[$hash] = $hash_url;
                 }
             }
-            $this->count_block = count($pre_out_pages);
+        }

-            /*
-                Calculate grouped values for each field of the groups we found
-             */
-            $out_pages = array();
-            foreach($pre_out_pages as $hash_url => $group_infos) {
-                foreach($group_infos as $doc_info) {
-                    $is_page = ($doc_info[self::HASH] !== false) ? true : false;
-                    if(!isset($out_pages[$hash_url]) || $is_page) {
-                        if(isset($out_pages[$hash_url]) && $is_page) {
-                            $rank = $out_pages[$hash_url][self::DOC_RANK];
-                            $relevance = $out_pages[$hash_url][self::RELEVANCE];
-                            $out_pages[$hash_url] = $doc_info;
-                            $out_pages[$hash_url][self::DOC_RANK] += $rank;
-                            $out_pages[$hash_url][self::RELEVANCE] +=
-                                $relevance;
-                            $out_pages[$hash_url][self::SCORE] +=
-                                $rank + $relevance;
-                        } else {
-                            $out_pages[$hash_url] = $doc_info;
-                        }
-                        $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
-                        if(isset($doc_info[self::SUMMARY_OFFSET]) &&
-                          isset($doc_info[self::GENERATION])) {
-                            $out_pages[$hash_url][self::SUMMARY_OFFSET] =
-                                array(array($doc_info["KEY"],
-                                    $doc_info[self::GENERATION],
-                                    $doc_info[self::SUMMARY_OFFSET]));
-                            unset($out_pages[$hash_url]["KEY"]);
-                        }
-                    } else {
-                        $fields = array_keys($out_pages[$hash_url]);
-                        foreach($fields as $field) {
-                            if(isset($doc_info[$field]) &&
-                                $field != self::SUMMARY_OFFSET &&
-                                $field != self::GENERATION) {
-                                $out_pages[$hash_url][$field] +=
-                                    $doc_info[$field];
-                            } else if($field == self::SUMMARY_OFFSET) {
-                                $out_pages[$hash_url][$field][] =
-                                    array($doc_info["KEY"],
-                                        $doc_info[self::GENERATION],
-                                        $doc_info[$field]);
-                            }
-                        }
-                    }
-                }
+     }
+
+    /**
+     * For a collection of grouped pages, generates a summary for each group
+     * and returns an array of out pages consisting of a single summarized
+     * document per group. These summarized documents have aggregated scores
+     * to which a "boost" has been added.
+     * For a given summarized page, its boost is an estimate of the score of
+     * the pages that would have been grouped with it had more pages from the
+     * underlying iterator been examined. It is calculated by looking at the
+     * total number of inlinks to the page, estimating how many of these
+     * inlinks would have been returned by the current iterator, then assuming
+     * the scores of these pages follow a Zipfian distribution and computing
+     * the appropriate integral.
+     *
+     * @param array &$pre_out_pages array of groups of pages for which out
+     *      pages are to be generated
+     * @return array $out_pages array of single summarized documents to which
+     *      a "boost" has been applied
+     */
+    function computeBoostAndOutPages(&$pre_out_pages)
+    {
+        $out_pages = array();
+        foreach($pre_out_pages as $hash_url => $group_infos) {
+            $out_pages[$hash_url] = $pre_out_pages[$hash_url][0];
+            $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
+            unset($out_pages[$hash_url][self::GENERATION]);
+            for($i = 0; $i <
+                $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT]; $i ++) {
+                $doc_info = $group_infos[$i];
+                $out_pages[$hash_url][self::SUMMARY_OFFSET][] =
+                    array($doc_info["KEY"], $doc_info[self::GENERATION],
+                        $doc_info[self::SUMMARY_OFFSET]);
             }
-            $pages = $out_pages;
+
+            if($this->count_block_unfiltered >=$this->results_per_block) {
+                /* approximate the scores contributed to this
+                   doc for this word search by links we haven't
+                   reached in our grouping
+                */
+                $word_iterator = new WordIterator(
+                    $out_pages[$hash_url][self::INLINKS],
+                    $this->getIndex(), true);
+                $num_inlinks = $word_iterator->num_docs + 0.1;
+                $num_docs_seen = $this->seen_docs_unfiltered +
+                    $this->count_block_unfiltered;
+
+                $hash_count = $out_pages[$hash_url][self::HASH_URL_COUNT];
+
+                /*
+                    An attempt to approximate the total number of inlinks
+                    to a document which will have the terms in question.
+                 */
+                $total_inlinks_for_doc = min($num_inlinks,
+                    $hash_count * $this->num_docs/$num_docs_seen);
+
+                /*
+                     The score of the xth inlink for this document
+                     is approximately score[x] = score[1]x^{-alpha}.
+                     If n = $total_inlinks_for_doc, then by integrating this
+                     from k = self::HASH_URL_COUNT to n, we get an
+                     approximation for the score we haven't seen (which
+                     we call the boost):
+                     boost = score[1](n^{1-alpha} - k^{1-alpha})/(1-alpha)
+                     Let exponent = 1-alpha.
+                */
+                $max_score = $out_pages[$hash_url][self::MAX];
+                $exponent = 1 - ((log($max_score) -
+                    log($out_pages[$hash_url][self::MIN]))/ log($num_inlinks));
+                $boost = $max_score*(pow($total_inlinks_for_doc, $exponent) -
+                    pow($hash_count, $exponent))/$exponent;
+                /*
+                  although relevance is a log-based quantity, we want to
+                  further penalize docs with a high rank but low relevance in
+                  the underlying iterator, so we weight by (1 + relevance)/2
+                 */
+                $out_pages[$hash_url][self::SCORE] =
+                    ($out_pages[$hash_url][self::HASH_SUM_SCORE] + $boost) *
+                    (1 + $out_pages[$hash_url][self::RELEVANCE])/2;
+            } else {
+                $out_pages[$hash_url][self::SCORE] =
+                    $out_pages[$hash_url][self::HASH_SUM_SCORE] *
+                    (1 + $out_pages[$hash_url][self::RELEVANCE])/2;
+            }
+
         }

-        $this->pages = $pages;
-        return $pages;
+        return $out_pages;
+    }

+    /**
+     * For a collection of pages each with the same url, computes the min
+     * score, max score, sum of scores, sum of ranks, sum of relevance scores,
+     * and count, storing them in the first element of the array of pages.
+     * @param array &$pre_hash_page pages with the same url hash to aggregate
+     */
+    function aggregateScores(&$pre_hash_page)
+    {
+        $sum_score = 0;
+        $sum_rank = 0;
+        $sum_relevance = 0;
+        $min = 1000000; //no score will be this big
+        $max = -1;
+        foreach($pre_hash_page as $hash_page) {
+            if(isset($hash_page[self::SCORE])) {
+                $current_score = $hash_page[self::SCORE];
+                $min = ($current_score < $min ) ? $current_score : $min;
+                $max = ($max < $current_score ) ? $current_score : $max;
+                $sum_score += $current_score;
+                $sum_rank += $hash_page[self::DOC_RANK];
+                $sum_relevance += $hash_page[self::RELEVANCE];
+            }
+        }
+        $pre_hash_page[0][self::MIN] = $min;
+        $pre_hash_page[0][self::MAX] = $max;
+        $pre_hash_page[0][self::HASH_SUM_SCORE] = $sum_score;
+        $pre_hash_page[0][self::DOC_RANK] = $sum_rank;
+        $pre_hash_page[0][self::RELEVANCE] = $sum_relevance;
+        $pre_hash_page[0][self::HASH_URL_COUNT] = count($pre_hash_page);
     }

     /**
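The boost computed in computeBoostAndOutPages above can be restated as a single integral. Under the comment's model that the xth highest scoring inlink of a document has score roughly score[1] x^{-alpha}, and taking the largest score seen for a group (MAX) as score[1] and the smallest (MIN) as an estimate of the score of the last, num_inlinks-th, inlink (both assumptions read off the code, not stated elsewhere in the commit):

\[
\text{boost} \;\approx\; \int_{k}^{n} \text{score}[1]\, x^{-\alpha}\, dx
\;=\; \text{score}[1]\,\frac{n^{1-\alpha} - k^{1-\alpha}}{1-\alpha},
\qquad
\alpha \;\approx\; \frac{\log(\text{MAX}) - \log(\text{MIN})}{\log(\text{num\_inlinks})},
\]

where k = HASH_URL_COUNT (the inlinks already grouped) and n = total_inlinks_for_doc. With exponent = 1 - alpha and score[1] = MAX, this is exactly the boost = MAX (n^exponent - k^exponent)/exponent line in the hunk above.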
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 632b645c5..1b31bac59 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -227,7 +227,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * Used to keep track of whether a record in document infos is for a
      * document or for a link
      */
-    const COMPOSITE_ID_FLAG =  0x80000000;
+    const LINK_FLAG =  0x800000;

     /**
      * Size in bytes of one block in IndexShard
@@ -315,12 +315,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @return bool success or failure of performing the add
      */
     function addDocumentWords($doc_keys, $summary_offset, $word_counts,
-        $meta_ids)
+        $meta_ids, $is_doc = false)
     {
         if($this->word_docs_packed == true) {
             $this->unpackWordDocs();
         }
-        $is_doc = false;
+
         $doc_len = 0;
         $link_doc_len = 0;
         $len_key = strlen($doc_keys);
@@ -336,12 +336,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $added_len = strlen($summary_offset_string);
         $this->doc_infos .= $summary_offset_string;

-        if($num_keys <= 2) {
+        if($is_doc) {
             $this->num_docs++;
-            $is_doc = true;
         } else { //link item
             $this->num_link_docs++;
-            $is_doc = false;
         }
         foreach($meta_ids as $meta_id) {
             $word_counts[$meta_id] = 0;
@@ -369,7 +367,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $this->len_all_docs += $doc_len;
         $this->len_all_link_docs += $link_doc_len;
         $len_num_keys = ($is_doc) ? $this->packPosting($doc_len, $num_keys) :
-            $this->packPosting($link_doc_len, $num_keys);
+            $this->packPosting((self::LINK_FLAG | $link_doc_len), $num_keys);
         $this->doc_infos .=  $len_num_keys;
         $added_len += strlen($len_num_keys);
         $this->doc_infos .= $doc_keys;
@@ -494,7 +492,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         list($doc_len, $num_keys) =
             $this->unpackPosting(substr($doc_info_string, 4));
         $item[self::GENERATION] = $this->generation;
-        $is_doc = ($num_keys <= 2) ? true :false;
+        $is_doc = (($doc_len & self::LINK_FLAG) == 0) ? true : false;
+        if(!$is_doc) {$doc_len -= self::LINK_FLAG; }
+        $item[self::IS_DOC] = $is_doc;
         $skip_stats = false;

         if($item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) {
@@ -760,7 +760,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $num_words++;
             } else {
                 if($old_prefix !== false) {
-                    $tmp[$old_prefix] = pack("N", $offset) .
+                    $tmp[$old_prefix] = packInt($offset) .
                         pack("N", $num_words);
                     $offset += $num_words * $word_item_len;
                 }
@@ -768,7 +768,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $num_words = 1;
             }
         }
-        $tmp[$old_prefix] = pack("N", $offset) . pack("N", $num_words);
+        $tmp[$old_prefix] = packInt($offset) . packInt($num_words);
         $num_prefixes = 2 << 16;
         $this->prefixes = "";
         for($i = 0; $i < $num_prefixes; $i++) {
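The index_shard.php hunks above replace the old "two or fewer keys means a document" heuristic with an explicit flag: when a link item's length is packed into doc_infos, the 0x800000 bit is set, and the read path tests and then strips that bit while recording IS_DOC on the item. A minimal sketch of just the bit arithmetic, with a plain integer standing in for the value that the shard additionally runs through packPosting():

<?php
// Sketch only, not part of the commit: the LINK_FLAG bit test in isolation.
const LINK_FLAG = 0x800000;

$link_doc_len = 1234;                 // length of a link mini-document
$stored = LINK_FLAG | $link_doc_len;  // what gets recorded for a link item

$is_doc = (($stored & LINK_FLAG) == 0) ? true : false; // false for links
if(!$is_doc) {
    $stored -= LINK_FLAG;             // recover the original length
}
// $stored is 1234 again; $is_doc now distinguishes docs from link items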
diff --git a/lib/utility.php b/lib/utility.php
index e885dec49..f5bddaeae 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -111,6 +111,29 @@ function packInt($my_int)
     return pack("N", $my_int);
 }

+/**
+ * Unpacks a float from a 4 char string
+ *
+ * @param string $str where to extract float from
+ * @return float extracted float
+ */
+function unpackFloat($str)
+{
+    $tmp = unpack("f", $str);
+    return $tmp[1];
+}
+
+/**
+ * Packs a float into a 4 char string
+ *
+ * @param float $my_float the float to pack
+ * @return string the packed string
+ */
+function packFloat($my_float)
+{
+    return pack("f", $my_float);
+}
+
 /**
  * Converts a string to string where each char has been replaced by its
  * hexadecimal equivalent
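Because pack("f") stores a single precision value in the machine's native representation, a weight does not always round trip to exactly the double it started as: values like 3.5 are exact, while something like 0.1 comes back only approximately. A quick check using the new helpers (include path assumed):

<?php
// Sketch only, not part of the commit.
require_once "lib/utility.php";

var_dump(strlen(packFloat(3.5)));       // int(4)
var_dump(unpackFloat(packFloat(3.5)));  // float(3.5), exactly representable
var_dump(unpackFloat(packFloat(0.1)));  // approximately 0.1, not exact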