Changed indexing to use index shards, a=chris

Chris Pollett [2010-10-11 15:Oct:th]

Changed indexing to use index shards, a=chris

Filename
bin/fetcher.php
bin/queue_server.php
configs/config.php
controllers/search_controller.php
lib/bloom_filter_bundle.php
lib/bloom_filter_file.php
lib/crawl_constants.php
lib/index_archive_bundle.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/index_bundle_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/phrase_filter_iterator.php
lib/index_bundle_iterators/union_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
lib/processors/html_processor.php
lib/string_array.php
lib/utility.php
models/phrase_model.php
views/search_view.php

diff --git a/bin/fetcher.php b/bin/fetcher.php
index 75a2c2624..275845fb8 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -72,6 +72,8 @@ require_once BASE_DIR."/lib/crawl_daemon.php";
 require_once BASE_DIR."/lib/fetch_url.php";
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";
+/** used to build miniinverted index*/
+require_once BASE_DIR."/lib/index_shard.php";


 /*
@@ -1033,231 +1035,56 @@ class Fetcher implements CrawlConstants
     }

     /**
-     * Builds an inverted index (word --> {docs it appears in}) for the current
-     * batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. This inverted
-     * is then merged by the queue_server into the inverted index of the
-     * current generation of the crawl. The complete inverted index for the
-     * whole crawl is built out of these inverted indexes for generations.
-     * The point of computing a partial inverted index on the fetcher is to
-     * reduce some of the computational burden on the queue server. The
-     * resulting mini index computed by buildMiniInvertedIndex() is stored in
+     * Builds an inverted index shard (word --> {docs it appears in})
+     * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
+     * This inverted index shard is then merged by the queue_server
+     * into the inverted index of the current generation of the crawl.
+     * The complete inverted index for the whole crawl is built out of these
+     * inverted indexes for generations. The point of computing a partial
+     * inverted index on the fetcher is to reduce some of the computational
+     * burden on the queue server. The resulting mini index computed by
+     * buildMiniInvertedIndex() is stored in
      * $this->found_sites[self::INVERTED_INDEX]
      *
      */
     function buildMiniInvertedIndex()
     {
         $start_time = microtime();
-        $words = array();
-        $doc_statistics = $this->computeDocumentStatistics();
-        $average_title_length = $doc_statistics[self::AVERAGE_TITLE_LENGTH];
-        $average_description_length =
-            $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH];
-        $average_total_link_text_length =
-            $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH];
-
-        $special_case_fields = array(self::INLINKS, self::SITE_INFO,
-            self::FILETYPE, self::URL_INFO);
-
-        foreach($doc_statistics as $doc_key => $info) {
-            if(in_array($doc_key, $special_case_fields)) {continue;}
-            $title_length = $info[self::TITLE_LENGTH];
-            $description_length = $info[self::DESCRIPTION_LENGTH];
-            $link_length = $info[self::LINK_LENGTH];
-
-            $title_ratio = ($average_title_length > 0) ?
-                $title_length/$average_title_length : 0;
-            $description_ratio = ($average_description_length > 0) ?
-                $description_length/$average_description_length :0;
-            $link_ratio = ($average_total_link_text_length > 0) ?
-                $link_length/$average_total_link_text_length : 0;
-
-            if(isset($info[self::TITLE_WORDS])) {
-                foreach($info[self::TITLE_WORDS]
-                    as $word_key => $num_occurrences) {
-                    $title_frequency = $num_occurrences/$title_length;
-
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::TITLE_WORD_SCORE] =
-                            number_format(3 * $title_frequency/
-                                ($title_frequency + .5 + 1.5* $title_ratio),
-                                PRECISION);
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::DESCRIPTION_WORD_SCORE] = 0;
-                        // will set in a moment if has value
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::LINK_WORD_SCORE] = 0;
-                }
-            }
-
-            if(isset($info[self::DESCRIPTION_WORDS])) {
-                foreach($info[self::DESCRIPTION_WORDS]
-                    as $word_key => $num_occurrences) {
-                    $description_frequency =
-                        $num_occurrences/$description_length;
-
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::DESCRIPTION_WORD_SCORE] =
-                            number_format(3 * $description_frequency/
-                                ($description_frequency
-                                + .5 + 1.5* $description_ratio), PRECISION);
-
-                    if(!isset($words[crawlHash($word_key)][$doc_key][
-                        self::TITLE_WORD_SCORE])) {
-                            $words[crawlHash($word_key)][$doc_key][
-                                self::TITLE_WORD_SCORE] = 0;
-                    }
-
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::LINK_WORD_SCORE] = 0;
-                }
-            }
-
-            if(isset($info[self::LINK_WORDS])) {
-                foreach($info[self::LINK_WORDS]
-                    as $word_key => $num_occurrences) {
-                    $link_frequency = $num_occurrences/$link_length;
-
-                    $words[crawlHash($word_key)][$doc_key][
-                        self::LINK_WORD_SCORE] = number_format(
-                            3 * $link_frequency/
-                            ($link_frequency + .5 + 1.5* $link_ratio),
-                            PRECISION);
-
-                    if(!isset($words[crawlHash($word_key)][$doc_key][
-                        self::TITLE_WORD_SCORE])) {
-                        $words[crawlHash($word_key)][$doc_key][
-                            self::TITLE_WORD_SCORE] = 0;
-                    }
-
-                    if(!isset($words[crawlHash($word_key)][$doc_key][
-                        self::DESCRIPTION_WORD_SCORE])) {
-                        $words[crawlHash($word_key)][$doc_key][
-                            self::DESCRIPTION_WORD_SCORE] = 0;
-                    }
-                }
-            }
-
-        } // end foreach
-
-        foreach($words as $word_key => $docs_info) {
-            foreach($docs_info as $doc_key => $info) {
-                $doc_depth =  $doc_statistics[$doc_key][self::DOC_DEPTH];
-                $doc_rank = (11 - $doc_depth) +
-                    $doc_statistics[$doc_key][self::URL_WEIGHT];
-                $words[$word_key][$doc_key][self::DOC_RANK] =
-                    number_format($doc_rank, PRECISION); //proxy for page rank
-
-                $orphan = (isset($info[self::LINK_WORDS]) &&
-                    $info[self::LINK_WORDS] == true) ? 1 : .5;
-
-                $words[$word_key][$doc_key][self::SCORE] = number_format(
-                    .8*($doc_rank)
-                    + $info[self::TITLE_WORD_SCORE]
-                    + 2*$info[self::DESCRIPTION_WORD_SCORE]*$orphan
-                    + 1.5*$info[self::LINK_WORD_SCORE],  PRECISION);
-
-            }
-        }
-
-
-        //add word_keys for inlink, sites, filetype
-        foreach($special_case_fields as $special_case_field) {
-            if(isset($doc_statistics[$special_case_field])) {
-            foreach($doc_statistics[$special_case_field]
-                as $url_word_key => $docs_info) {
-                foreach($docs_info as $doc_key) {
-                    $doc_depth = $doc_statistics[$doc_key][self::DOC_DEPTH] + 1;
-                    $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = 0;
-                    $words[$url_word_key][$doc_key][
-                        self::DESCRIPTION_WORD_SCORE] = 0;
-                    $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = 0;
-                    $words[$url_word_key][$doc_key][self::DOC_RANK] =
-                        number_format(11 - $doc_depth, PRECISION);
-                    $words[$url_word_key][$doc_key][self::SCORE] =
-                        number_format(11 - $doc_depth, PRECISION);
-                }
-
-            }
-            }
-        }
-        foreach($this->found_duplicates as $duplicate) {
-            $doc_key = crawlHash($duplicate);
-            $url_word_key = crawlHash("info:".$duplicate);
-            $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = -1;
-            $words[$url_word_key][$doc_key][self::DESCRIPTION_WORD_SCORE] = -1;
-            $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = -1;
-            $words[$url_word_key][$doc_key][self::DOC_RANK] = -1;
-            $words[$url_word_key][$doc_key][self::SCORE] = -1;
-        }
-        $this->found_duplicates = array();
-
-        $this->found_sites[self::INVERTED_INDEX] = $words;
-
-        crawlLog("  Build mini inverted index time ".
-            (changeInMicrotime($start_time)));
-    }
-

-    /**
-     * Used to compute number of words in each component (title, description,
-     * links) of a document separately as well as compute average amongst the
-     * current group of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many docs.
-     *
-     * @return array computed statistics
-     */
-    function computeDocumentStatistics()
-    {
-        $doc_statistics = array();
         $num_seen = count($this->found_sites[self::SEEN_URLS]);
         $this->num_seen_sites += $num_seen;
+        /*
+            for the fetcher we are not saving the index shards so
+            name doesn't matter.
+        */
+        $index_shard = new IndexShard("fetcher_shard");
         for($i = 0; $i < $num_seen; $i++) {
             $site = $this->found_sites[self::SEEN_URLS][$i];
-            $doc_key = crawlHash($site[self::URL]);
-
-            $doc_statistics[$doc_key][self::URL_WEIGHT] =
-                3 - log(strlen($site[self::URL])); //negative except short urls
-            $doc_statistics[$doc_key][self::DOC_DEPTH] =
-                log($site[self::INDEX]*NUM_FETCHERS, 10);
-            $title_phrase_string =
-                mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE]);
-            $doc_statistics[$doc_key][self::TITLE_WORDS] =
-                PhraseParser::extractPhrasesAndCount($title_phrase_string);
-            $doc_statistics[$doc_key][self::TITLE_LENGTH] =
-                $this->sumCountArray(
-                    $doc_statistics[$doc_key][self::TITLE_WORDS]);
-            $this->sum_seen_site_title_length +=
-                $doc_statistics[$doc_key][self::TITLE_LENGTH];
-
-            $description_phrase_string =
-                mb_ereg_replace("[[:punct:]]", " ", $site[self::DESCRIPTION]);
-            $doc_statistics[$doc_key][self::DESCRIPTION_WORDS] =
-                PhraseParser::extractPhrasesAndCount(
-                    $description_phrase_string);
-            $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH] =
-                $this->sumCountArray(
-                    $doc_statistics[$doc_key][self::DESCRIPTION_WORDS]);
-            $this->sum_seen_site_description_length +=
-                $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH];
-            $doc_statistics[$doc_key][self::LINK_WORDS] = array();
-            $doc_statistics[$doc_key][self::LINK_LENGTH] = 0;
+            $doc_key = crawlHash($site[self::URL], true);
+            $word_counts = array();
+            $phrase_string =
+                mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE] .
+                   " ". $site[self::DESCRIPTION]);
+            $word_counts =
+                PhraseParser::extractPhrasesAndCount($phrase_string);
+
+            $meta_ids = array();
+
             // store the sites the doc_key belongs to, so you can search by site
             $url_sites = UrlParser::getHostPaths($site[self::URL]);
             $url_sites = array_merge($url_sites,
                 UrlParser::getHostSubdomains($site[self::URL]));
             foreach($url_sites as $url_site) {
                 if(strlen($url_site) > 0) {
-                    $doc_statistics[self::SITE_INFO][
-                        crawlHash('site:'.$url_site)][] = $doc_key;
+                    $meta_ids[] = 'site:'.$url_site;
                 }
             }
-            $doc_statistics[self::URL_INFO][
-                crawlHash('info:'.$site[self::URL])][] = $doc_key;
+            $meta_ids[] = 'info:'.$site[self::URL];

             // store the filetype info
             $url_type = UrlParser::getDocumentType($site[self::URL]);
             if(strlen($url_type) > 0) {
-                $doc_statistics[self::FILETYPE][
-                    crawlHash('filetype:'.$url_type)][] = $doc_key;
+                $meta_ids[] = 'filetype:'.$url_type;
             }

             $link_phrase_string = "";
@@ -1271,6 +1098,7 @@ class Fetcher implements CrawlConstants
             }
             $had_links = false;

+            $link_shard = new IndexShard("link_shard");
             foreach($site[self::LINKS] as $url => $link_text) {
                 if(strlen($url) > 0) {
                     $summary = array();
@@ -1278,8 +1106,9 @@ class Fetcher implements CrawlConstants
                     $link_text = strip_tags($link_text);
                     $link_id =
                         "url|".$url."|text|$link_text|ref|".$site[self::URL];
-                    $link_key =  crawlHash($link_id).":".crawlHash($url).":"
-                        .crawlHash("info:".$url);
+                    $link_key =  crawlHash($link_id, true).":".
+                        crawlHash($url, true).":"
+                        .crawlHash("info:".$url, "true");
                     $summary[self::URL] =  $link_id;
                     $summary[self::TITLE] = $url;
                         // stripping html to be on the safe side
@@ -1289,72 +1118,33 @@ class Fetcher implements CrawlConstants
                     $summary[self::HASH] =  crawlHash($link_id);
                     $summary[self::TYPE] = "link";
                     $summary[self::HTTP_CODE] = "link";
-                    $summary[self::WEIGHT] =  $link_weight;
                     $this->found_sites[self::SEEN_URLS][] = $summary;
-
-                    $doc_statistics[$link_key][self::URL_WEIGHT] =
-                        3 - log(strlen($url));
-                        //negative except short urls
-                    $doc_statistics[$link_key][self::TITLE_WORDS]  =array();
-                    $doc_statistics[$link_key][self::TITLE_LENGTH] = 0;
-                    $doc_statistics[$link_key][self::DESCRIPTION_WORDS] =
-                        array();
-                    $doc_statistics[$link_key][self::DESCRIPTION_LENGTH] = 0;

                     $link_text =
                         mb_ereg_replace("[[:punct:]]", " ", $link_text);
-                    $doc_statistics[$link_key][self::LINK_WORDS] =
+                    $link_word_counts =
                         PhraseParser::extractPhrasesAndCount($link_text);
-                    $doc_statistics[$link_key][self::LINK_LENGTH] =
-                        $this->sumCountArray(
-                            $doc_statistics[$link_key][self::LINK_WORDS]);
-                    $this->sum_seen_site_link_length +=
-                        $doc_statistics[$link_key][self::LINK_LENGTH];
-
-                    $doc_statistics[$link_key][self::DOC_DEPTH] =
-                        log(10*$site[self::INDEX]*NUM_FETCHERS, 10);
-                        //our proxy for page rank, 10=average links/page
-                    $doc_statistics[self::INLINKS][crawlHash('link:'.$url)][] =
-                        $doc_key;
+                    $link_shard->addDocumentWords($link_key, 0,
+                        $link_word_counts, array());
+
+                    $meta_ids[] = 'link:'.$url;
                 }
-                $this->found_sites[self::SEEN_URLS][$i][self::LINKS] =
-                    $had_links;
-            }

+            }
+            $index_shard->addDocumentWords($doc_key, 0, $word_counts,
+                $meta_ids);
+
+            $index_shard->appendIndexShard($link_shard);

         }
+        $index_shard->markDuplicateDocs($this->found_duplicates);

-        $doc_statistics[self::AVERAGE_TITLE_LENGTH] =
-            $this->sum_seen_site_title_length/$this->num_seen_sites;
-
-        $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH] =
-            $this->sum_seen_site_description_length/$this->num_seen_sites;
-
-        $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH] =
-            $this->sum_seen_site_link_length/$this->num_seen_sites;
-
-        crawlLog("AVERAGE TITLE LENGTH".
-            $doc_statistics[self::AVERAGE_TITLE_LENGTH]);
-        crawlLog("AVERAGE DESCRIPTION LENGTH".
-            $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH]);
-        crawlLog("AVERAGE TOTAL LINK TEXT LENGTH".
-            $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH]);
-        return $doc_statistics;
-    }
+        $this->found_duplicates = array();

-    /**
-     * Computes a sum of the values of an associative array of key-value pairs
-     *
-     * @param array &$arr the associative array to compute the sum of
-     */
-    function sumCountArray(&$arr)
-    {
-        $sum = 0;
-        foreach($arr as $key => $value) {
-            $sum += $value;
-        }
+        $this->found_sites[self::INVERTED_INDEX] = & $index_shard;

-        return $sum;
+        crawlLog("  Build mini inverted index time ".
+            (changeInMicrotime($start_time)));
     }
 }

diff --git a/bin/queue_server.php b/bin/queue_server.php
index 8677edf76..d190d1fdf 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -383,7 +383,7 @@ class QueueServer implements CrawlConstants
                 CRAWL_DIR.'/cache/'.
                     self::index_data_base_name.$this->crawl_time,
                 URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS,
-                NUM_INDEX_PARTITIONS, serialize($info));
+                serialize($info));
         } else {
             $dir = CRAWL_DIR.'/cache/'.
                     self::index_data_base_name.$this->crawl_time;
@@ -505,7 +505,6 @@ class QueueServer implements CrawlConstants

         $start_time = microtime();

-        $index_archive = $this->index_archive;
         $fh = fopen($file, "rb");
         $machine_string = fgets($fh);
         $len = strlen($machine_string);
@@ -526,7 +525,8 @@ class QueueServer implements CrawlConstants
         if(isset($sites[self::SEEN_URLS]) &&
             count($sites[self::SEEN_URLS]) > 0) {
             $seen_sites = $sites[self::SEEN_URLS];
-            $index_archive->differenceContainsPages($seen_sites, self::HASH);
+            $this->index_archive->differenceContainsPages(
+                $seen_sites, self::HASH);
             $seen_sites = array_values($seen_sites);
             $num_seen = count($seen_sites);
         } else {
@@ -535,18 +535,18 @@ class QueueServer implements CrawlConstants

         $visited_urls_count = 0;
         for($i = 0; $i < $num_seen; $i++) {
-            $index_archive->addPageFilter(self::HASH, $seen_sites[$i]);
+            $this->index_archive->addPageFilter(self::HASH, $seen_sites[$i]);
             $seen_sites[$i][self::MACHINE] = $machine;
             $seen_sites[$i][self::MACHINE_URI] = $machine_uri;
             $seen_sites[$i][self::HASH_URL] =
-                crawlHash($seen_sites[$i][self::URL]);
+                crawlHash($seen_sites[$i][self::URL], true);
             $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
             if(strcmp("url", $link_url_parts[0]) == 0 &&
                 strcmp("text", $link_url_parts[2]) == 0) {
                 $seen_sites[$i][self::HASH_URL] =
-                    crawlHash($seen_sites[$i][self::URL]).
-                    ":".crawlHash($link_url_parts[1]).
-                    ":".crawlHash("info:".$link_url_parts[1]);
+                    crawlHash($seen_sites[$i][self::URL], true).
+                    ":".crawlHash($link_url_parts[1],true).
+                    ":".crawlHash("info:".$link_url_parts[1], true);
             } else {
                 $visited_urls_count++;
             }
@@ -554,7 +554,7 @@ class QueueServer implements CrawlConstants

         if(isset($seen_sites)) {
             $seen_sites =
-                $index_archive->addPages(
+                $this->index_archive->addPages(
                     self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites,
                     $visited_urls_count);

@@ -563,38 +563,25 @@ class QueueServer implements CrawlConstants
                 $summary_offsets[$site[self::HASH_URL]] =
                     $site[self::SUMMARY_OFFSET];
             }
-            crawlLog("B memory usage".memory_get_usage() .
+            crawlLog("B (dedup + random) memory usage".memory_get_usage() .
                 " time: ".(changeInMicrotime($start_time)));
             $start_time = microtime();
             // added summary offset info to inverted index data
-            if(isset($sites[self::INVERTED_INDEX])) {
-                $index_data = & $sites[self::INVERTED_INDEX];
-                foreach( $index_data as $word_key => $docs_info) {
-                    foreach($docs_info as $doc_key => $info) {
-                        if(isset($summary_offsets[$doc_key])) {
-                            $index_data[$word_key][$doc_key][
-                                self::SUMMARY_OFFSET] =
-                                    $summary_offsets[$doc_key];
-                        }
-                    }
-                }
+            if(isset($sites[self::INVERTED_INDEX])) {
+                $index_shard = & $sites[self::INVERTED_INDEX];
+                $index_shard->changeDocumentOffsets($summary_offsets);
             }
         }
-        crawlLog("C memory usage".memory_get_usage() .
-            " time: ".(changeInMicrotime($start_time)));
-        $start_time = microtime();
-        $index_archive->forceSave();
-        crawlLog("D memory usage".memory_get_usage() .
+        crawlLog("C (update shard offsets) memory usage".memory_get_usage() .
             " time: ".(changeInMicrotime($start_time)));
         $start_time = microtime();

-        if(isset($index_data)) {
-            $index_archive->addIndexData($index_data);
+        if(isset($index_shard)) {
+            $this->index_archive->addIndexData($index_shard);
         }
-        crawlLog("E memory usage".memory_get_usage().
+        crawlLog("D (add index shard) memory usage".memory_get_usage().
             " time: ".(changeInMicrotime($start_time)));

-
         crawlLog("Done Processing File: $file");

         unlink($file);
diff --git a/configs/config.php b/configs/config.php
index ac1b4768a..6a011ff87 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -152,14 +152,8 @@ define('MIN_QUEUE_WEIGHT', 1/100000);
 /**  number of web archive files to use to store web pages in */
 define('NUM_ARCHIVE_PARTITIONS', 10);

-/**
- * number of web archive files to use for the inverted index of
- * word->docs in a given generation
- */
-define('NUM_INDEX_PARTITIONS', 250);
-
-/** number of words before next gen */
-define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS);
+/** number of documents before next gen */
+define('NUM_DOCS_PER_GENERATION', 10000);

 /** number of generations to sample in estimating number of urls in a query */
 define('SAMPLE_GENERATIONS', 3);
@@ -167,15 +161,6 @@ define('SAMPLE_GENERATIONS', 3);
 /** precision to round floating points document scores */
 define('PRECISION', 10);

-/**
- * when index data from relatively uncommon words,
- * how many docs should be grouped together in a block
- */
-define('BLOCK_SIZE', 50);
-
-/** how many documents a word needs to be to get its own index file. */
-define('COMMON_WORD_THRESHOLD', 1000);
-
 /** maximum number of links to consider on any given page */
 define('MAX_LINKS_PER_PAGE', 50);

@@ -263,7 +248,7 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
  * How many non robot urls the fetcher successfully downloads before
  * between times data sent back to queue server
  */
-define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 400);
+define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 500);

 /** maximum number of urls to schedule to a given fetcher in one go */
 define ('MAX_FETCH_SIZE', 5000);
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index 8b027f373..716ab82ee 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -231,7 +231,7 @@ class SearchController extends Controller implements CrawlConstants
                         $this->phraseModel->lookupSummaryOffset($url);
                 }
                 $crawl_item = $this->crawlModel->getCrawlItem(
-                    crawlHash($url), $summary_offset);
+                    crawlHash($url, true), $summary_offset);

                 $top_phrases  =
                     $this->phraseModel->getTopPhrases($crawl_item, 3);
@@ -332,7 +332,7 @@ class SearchController extends Controller implements CrawlConstants
             $summary_offset = $this->phraseModel->lookupSummaryOffset($url);
         }

-        if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url),
+        if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url, true),
             $summary_offset)) {

             $this->displayView("nocache", $data);
diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php
index 6962e7b34..4615adc85 100644
--- a/lib/bloom_filter_bundle.php
+++ b/lib/bloom_filter_bundle.php
@@ -102,7 +102,7 @@ class BloomFilterBundle
             $this->num_filters++;
             $this->filter_size = $filter_size;
             $this->current_filter->save();
-           $this->saveMetaData();
+            $this->saveMetaData();
         } else {
             $last_filter = $this->num_filters - 1;
             $this->current_filter =
@@ -132,12 +132,13 @@ class BloomFilterBundle
                     $this->filter_size);
             $this->current_filter_count = 0;
             $this->num_filters++;
+            $this->saveMetaData();
         }

         $this->current_filter->add($value);

         $this->current_filter_count++;
-        $this->saveMetaData();
+
     }

     /**
diff --git a/lib/bloom_filter_file.php b/lib/bloom_filter_file.php
index 77c494d87..dcc4c12ec 100755
--- a/lib/bloom_filter_file.php
+++ b/lib/bloom_filter_file.php
@@ -102,9 +102,9 @@ class BloomFilterFile extends PersistentStructure
     function add($value)
     {
         $num_keys = $this->num_keys;
+        $pos_array = $this->getHashBitPositionArray($value, $num_keys);
         for($i = 0;  $i < $num_keys; $i++) {
-            $pos = $this->getHashBitPosition($value.$i);
-            $this->setBit($pos);
+            $this->setBit($pos_array[$i]);
         }

         $this->checkSave();
@@ -119,10 +119,9 @@ class BloomFilterFile extends PersistentStructure
     function contains($value)
     {
         $num_keys = $this->num_keys;
+        $pos_array = $this->getHashBitPositionArray($value, $num_keys);
         for($i = 0;  $i < $num_keys; $i++) {
-            $pos = $this->getHashBitPosition($value.$i);
-
-            if(!$this->getBit($pos)) {
+            if(!$this->getBit($pos_array[$i])) {
                 return false;
             }
         }
@@ -136,15 +135,31 @@ class BloomFilterFile extends PersistentStructure
      * @param string $value value to map to a bit position in the filter
      * @return int the bit position mapped to
      */
-    function getHashBitPosition($value)
+    function getHashBitPositionArray($value, $num_keys)
     {
-        $hash = substr(md5($value, true), 0, 4);
-        $int_array = unpack("N", $hash);
-        $seed = $int_array[1];
+        $md5 = md5($value, true);
+        $seed = array();
+        for($i = 0; $i < 16; $i += 4) {
+            $hash = substr($md5, $i, 4);
+            $int_array = unpack("N", $hash);
+            $seed[] = $int_array[1];
+        }

-        mt_srand($seed);
-        $pos = mt_rand(0, $this->filter_size -1);
-        return $pos;
+        //$pos_array = array_fill(0, $num_keys, 0);
+        $pos_array = array();
+        $offset = $num_keys >> 2;
+        $size = $this->filter_size - 1;
+        $index = 0;
+        for($j = 0; $j < $num_keys; $j += $offset) {
+            $high = $j + $offset;
+            if($index < 4) {
+                mt_srand($seed[$index++]);
+            }
+            for($i = $j; $i < $high; $i++) {
+                $pos_array[$i] = mt_rand(0, $size);
+            }
+        }
+        return $pos_array;
     }

     /**
@@ -154,7 +169,7 @@ class BloomFilterFile extends PersistentStructure
      */
     function setBit($i)
     {
-        $byte = ($i >> 3);;
+        $byte = ($i >> 3);

         $bit_in_byte = $i - ($byte << 3);

diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 12a98b732..b836591ea 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -131,5 +131,8 @@ interface CrawlConstants
     const HASH_SEEN_URLS ='aj';
     const RECENT_URLS ='ak';
     const MEMORY_USAGE ='al';
+    const DOC_ID ='am';
+    const RELEVANCE ='an';
+    const DUPLICATE ='ao';
 }
 ?>
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 6ce39b8da..2fa1bef34 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -38,9 +38,9 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  */
 require_once 'web_archive_bundle.php';
 /**
- * Bloom Filter used by BloomFilterBundle
+ * Used to store word index
  */
-require_once 'bloom_filter_file.php';
+require_once 'index_shard.php';
 /**
  * Used to check if a page already stored in the WebArchiveBundle
  */
@@ -60,46 +60,6 @@ require_once 'crawl_constants.php';
 require_once 'indexing_constants.php';


-/**
- * Callback function used to set the offsets into the archive file from
- * the particular word info in the header block of a WordArchive
- *
- * @param array $data
- * @param array $objects
- * @param string $offset_field
- */
-function setOffsetPointers($data, &$objects, $offset_field)
-{
-    $count = count($objects);
-
-    for($i = 0 ; $i < $count ; $i++ ) {
-        if(isset($objects[$i][$offset_field]) ) {
-            $offset = $objects[$i][$offset_field];
-            foreach($objects[$i] as $word_key_and_block_num => $docs_info) {
-                $tmp = explode(":", $word_key_and_block_num);
-                if(isset($tmp[1]) ) {
-                    list($word_key, $block_num) = $tmp;
-                    if(strcmp($word_key, "offset") != 0) {
-                        if(($block_num +1)*BLOCK_SIZE <
-                            COMMON_WORD_THRESHOLD) {
-                            $data[$word_key][$block_num] = $offset;
-                        } else if(isset(
-                            $docs_info[IndexingConstants::POINT_BLOCK])) {
-                            $data[$word_key][IndexingConstants::LIST_OFFSET] =
-                                $offset;
-                        }
-                    }
-                }
-            }
-
-        }
-    }
-
-    return $data;
-}
-
-
-
 /**
  * Encapsulates a set of web page summaries and an inverted word-index of terms
  * from these summaries which allow one to search for summaries containing a
@@ -108,34 +68,12 @@ function setOffsetPointers($data, &$objects, $offset_field)
  * The basic file structures for an IndexArchiveBundle are:
  * <ol>
  * <li>A WebArchiveBundle for web page summaries.</li>
- * <li>A set of WebArchiveBundles for the inverted index. Each such bundle
- * is called a <b>generation</b>. These bundles have name index0, index1,...
- * The file generations.txt keeps track of what is the current generation
- * and how many words have been stored in it. A given generation can
- * hold NUM_WORDS_PER_GENERATION words amongst all its partitions. After which
- * the next generation begins. In a given generation, a word is stored in
- * the partition that its hash key hashes to. The same word may appear in
- * several generations. The info block for a partition for a particular
- * generation contains objects for each word of the generation that hashed
- * to that partition. Each such word object contains a count of the number
- * of documents it occurred in for that generation. It also has an
- * array of block_pointers to blocks of size BLOCK_SIZE. These blocks contains
- * documents that the word occurred in, the score for the occurrence, and
- * an offset into the summary file for that document. If the total number of
- * documents is not a multiple of BLOCK_SIZE the remaining documents are stored
- * directly in the word's info block object. If, in a given generation, a
- * word occurs more than COMMON_WORD_THRESHOLD many times then the word object
- * uses a LIST_OFFSET pointer to point to a linked list in the partition of
- * addtional blocks of documents for that word.
+ * <li>A set of inverted index generations. These generations
+ *  have name index0, index1,...
+ * The file generation.txt keeps track of what is the current generation.
+ * A given generation can hold NUM_DOCS_PER_GENERATION documents, after
+ * which the next generation begins.
  * </li>
- * <li>For each partition and for all generations a BloomFilterFile is used
- * to keep track of which words appear in which generations for a
- * particular partition. These filters are stored in a folder within the
- * IndexArchiveBundle called index_filters. When a word and documents
- * containing it are stored in an IndexArchiveBundle, its word_key (its has) is
- * stored in the filter for the partition its word_key hash to. Further
- * if the current generation is i, then work_ket concatenated with i is
- * also stored in this same filter.</li>
  * </ol>
  *
  *
@@ -145,12 +83,7 @@ function setOffsetPointers($data, &$objects, $offset_field)
  */
 class IndexArchiveBundle implements IndexingConstants, CrawlConstants
 {
-    /**
-     * Used to keep track of the time to perform various operations
-     * in this IndexArchiveBundle
-     * @var array
-     */
-    var $diagnostics;
+
     /**
      * Folder name to use for this IndexArchiveBundle
      * @var string
@@ -166,12 +99,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
      * @int
      */
     var $num_partitions_summaries;
-    /**
-     * Number of partitions in the inverted word index
-     * (same for each generation)
-     * @int
-     */
-    var $num_partitions_index;
+
     /**
      * structure contains info about the current generation:
      * its index (ACTIVE), and the number of words it contains
@@ -180,26 +108,20 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
      */
     var $generation_info;
     /**
-     * Number of words before a new generation is started
+     * Number of docs before a new generation is started
      * @int
      */
-    var $num_words_per_generation;
+    var $num_docs_per_generation;
     /**
      * WebArchiveBundle for web page summaries
      * @object
      */
     var $summaries;
     /**
-     * WebArchiveBundle for inverted word index
+     * Index Shard for current generation inverted word index
      * @object
      */
-    var $index;
-    /**
-     * Bloom Filters used to figure out which words are in which generations for
-     * given paritions
-     * @object
-     */
-    var $index_partition_filters;
+    var $current_shard;

     /**
      * Makes or initializes an IndexArchiveBundle with the provided parameters
@@ -216,8 +138,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
      * IndexArchiveBundle
      */
     public function __construct($dir_name, $filter_size = -1,
-        $num_partitions_summaries = NULL, $num_partitions_index = NULL,
-        $description = NULL)
+        $num_partitions_summaries = NULL, $description = NULL,
+        $num_docs_per_generation = NUM_DOCS_PER_GENERATION)
     {

         $this->dir_name = $dir_name;
@@ -225,7 +147,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

         if(!is_dir($this->dir_name)) {
             mkdir($this->dir_name);
-            mkdir($this->dir_name."/index_filters");
+
         } else {
             $index_archive_exists = true;

@@ -236,7 +158,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
                 file_get_contents($this->dir_name."/generation.txt"));
         } else {
             $this->generation_info['ACTIVE'] = 0;
-            $this->generation_info['NUM_WORDS'] = 0;
             file_put_contents($this->dir_name."/generation.txt",
                 serialize($this->generation_info));
         }
@@ -246,13 +167,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

         $this->num_partitions_summaries = $this->summaries->num_partitions;

-        $this->index = new WebArchiveBundle(
-            $dir_name."/index".$this->generation_info['ACTIVE'], -1,
-            $num_partitions_index);
-        $this->num_partitions_index = $this->index->num_partitions;
         $this->description = $this->summaries->description;

-        $this->num_words_per_generation = NUM_WORDS_PER_GENERATION;
+        $this->num_docs_per_generation = $num_docs_per_generation;

     }

@@ -283,228 +200,98 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
      * @param array $index_data a mini inverted index of word_key=>doc data
      *      to add to this IndexArchiveBundle
      */
-    public function addIndexData($index_data)
+    public function addIndexData($index_shard)
     {

-        $out_data = array();
-
-        if(!count($index_data) > 0) return;
-
-        /* Arrange the words according to the partitions they are in
-         */
-
-        $this->diagnostics['SELECT_TIME'] = 0;
-        $this->diagnostics['INFO_BLOCKS_TIME'] = 0;
-        $this->diagnostics['ADD_FILTER_TIME'] = 0;
-        $this->diagnostics['ADD_OBJECTS_TIME'] = 0;
+        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
         $start_time = microtime();
-        foreach($index_data as $word_key => $docs_info) {
-
-            $partition = WebArchiveBundle::selectPartition(
-                 $word_key, $this->num_partitions_index);
-            $out_data[$partition][$word_key] = $docs_info;
-
-        }
-        $this->diagnostics['SELECT_TIME'] += changeInMicrotime($start_time);
-
-        /* for each partition add the word data for the partition to the
-           partition web archive
-         */
-        $cnt = 0;
-        foreach($out_data as $partition => $word_data) {
-            $this->addPartitionWordData($partition, $word_data);
-            $cnt++;
+        $current_num_docs = $this->getActiveShard()->num_docs;
+        $add_num_docs = $index_shard->num_docs;
+        if($current_num_docs + $add_num_docs > $this->num_docs_per_generation){
+            $switch_time = microtime();
+            $this->forceSave();
+            $this->generation_info['ACTIVE']++;
+            $this->generation_info['CURRENT'] =
+                $this->generation_info['ACTIVE'];
+            $current_index_shard_file = $this->dir_name."/index".
+                $this->generation_info['ACTIVE'];
+            $this->current_shard = new IndexShard(
+                $current_index_shard_file, $this->generation_info['ACTIVE'] *
+                    $this->num_docs_per_generation);
+            file_put_contents($this->dir_name."/generation.txt",
+                serialize($this->generation_info));
+            crawlLog("Switch Shard time:".changeInMicrotime($switch_time));
         }
-        file_put_contents($this->dir_name."/generation.txt",
-            serialize($this->generation_info));
-        $out_data = NULL;
-        gc_collect_cycles();
-
-        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
-        crawlLog("**Time calculating select partition functions ".
-            $this->diagnostics['SELECT_TIME']);
-        crawlLog("**Time reading info blocks ".
-            $this->diagnostics['INFO_BLOCKS_TIME']);
-        crawlLog("**Time adding objects to index ".
-            $this->diagnostics['ADD_OBJECTS_TIME']);
-        crawlLog("**Time adding to filters ".
-            $this->diagnostics['ADD_FILTER_TIME']);
-        crawlLog("**Number of partitions ".$cnt);
-
+        $this->getActiveShard()->appendIndexShard($index_shard);
+        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
+          " Time: ".(changeInMicrotime($start_time)));
     }

     /**
-     * Adds the mini-inverted index data that to a particular partition.
-     * It is assume the word keys in this data would hash to the destined
-     * index partitions
-     *
-     * @param int $partition WebArchive in the index WebArchiveBundle of the
-     *      current generation to write to
-     * @param array &$word_data what to wrtie
-     * @param bool $overwrite whether to signal that all data in prior
-     * generations associated with keys that are being inserted should be
-     * ignored (for instance, multi-word search are partially computed and
-     * added to the index. If these get recomputed we might want to ignore
-     * prior work. )
+     * Makes the active shard (the last, i.e., highest indexed, shard in
+     * the bundle; the one new index data is written to) the current
+     * shard. Then returns a reference to this shard
+     * @return &object last shard in the bundle
      */
-    public function addPartitionWordData($partition,
-        &$word_data, $overwrite = false)
-    {
-        $start_time = microtime();
-
-        $block_data = $this->readPartitionInfoBlock($partition);
-
-        if(isset($this->diagnostics['INFO_BLOCKS_TIME'])) {
-            $this->diagnostics['INFO_BLOCKS_TIME'] +=
-                changeInMicrotime($start_time);
+     public function &getActiveShard()
+     {
+        if($this->setCurrentShard($this->generation_info['ACTIVE'])) {
+            return $this->getCurrentShard();
+        } else if(!isset($this->current_shard) ) {
+            $generation = $this->generation_info['CURRENT'];
+            $this->current_shard = new IndexShard(
+                $this->dir_name."/index".$generation,
+                $generation * $this->num_docs_per_generation);
         }
-
-        if($block_data == NULL) {
-            $block_data[self::NAME] = $partition;
-        }
-
-        //update counts set-up add link to offset linked lists
-        $out_data = array();
-        $out_data[0] = array();
-
-        $this->initPartitionIndexFilter($partition);
-
-        foreach($word_data as $word_key => $docs_info) {
-            $start_time = microtime();
+        return $this->current_shard;
+     }

-            $this->addPartitionIndexFilter($partition, $word_key);
-            $this->addPartitionIndexFilter(
-                $partition, $word_key . $this->generation_info['ACTIVE']);
-            if(isset($this->diagnostics['ADD_FILTER_TIME'])) {
-                $this->diagnostics['ADD_FILTER_TIME'] +=
-                    changeInMicrotime($start_time);
+    /**
+     * Returns the shard which is currently being used to read word-document
+     * data from the bundle. If one wants to write data to the bundle use
+     * getActiveShard() instead. The point of this method is to allow
+     * for lazy reading of the file associated with the shard.
+     *
+     * @return &object the index shard currently being read from
+     */
+     public function &getCurrentShard()
+     {
+        if(!isset($this->current_shard)) {
+            if(!isset($this->generation_info['CURRENT'])) {
+                $this->generation_info['CURRENT'] =
+                    $this->generation_info['ACTIVE'];
             }
-
-            if(!isset($block_data[$word_key]) || $overwrite == true) {
-                unset($block_data[$word_key]);
-                $block_data[$word_key][self::COUNT] = 0;
-                $block_data[$word_key][self::END_BLOCK] = array();
-                $block_data[$word_key][self::LIST_OFFSET] = NULL;
-                $unfilled_block_num = 0;
-
+            $current_index_shard_file = $this->dir_name."/index".
+                $this->generation_info['CURRENT'];
+
+            if(file_exists($current_index_shard_file) ) {
+                $this->current_shard =
+                    IndexShard::load($current_index_shard_file);
             } else {
-                $unfilled_block_num =
-                    floor($block_data[$word_key][self::COUNT] / BLOCK_SIZE);
+                $this->current_shard = new IndexShard($current_index_shard_file,
+                    $this->generation_info['CURRENT']*
+                    $this->num_docs_per_generation);
             }
-
-            $cnt = count($docs_info);
-            $block_data[$word_key][self::COUNT] += $cnt;
-
-            $tmp =
-                array_merge($block_data[$word_key][self::END_BLOCK],$docs_info);
-            uasort($tmp, "docRankOrderCallback");
-            $add_cnt = count($tmp);
-            $num_blocks = floor($add_cnt / BLOCK_SIZE);
-            $block_data[$word_key][self::END_BLOCK] =
-                array_slice($tmp, $num_blocks*BLOCK_SIZE);
-
-            $first_common_flag = true;
-            $min_common = NULL;
-            $slice_cnt = $num_blocks - 1;
-            for($i = $unfilled_block_num + $num_blocks - 1;
-                $i >= $unfilled_block_num ; $i--) {
-                $out_data[0][$word_key .":". $i] =
-                    array_slice($tmp, $slice_cnt*BLOCK_SIZE, BLOCK_SIZE);
-                if(($i+1)*BLOCK_SIZE > COMMON_WORD_THRESHOLD) {
-                    $min_common = $i;
-                    if($first_common_flag) {
-                        if(isset($block_data[$word_key][self::LIST_OFFSET])) {
-                            $out_data[0][$word_key .":". $i][self::LIST_OFFSET]=
-                                $block_data[$word_key][self::LIST_OFFSET];
-                        } else {
-                            $out_data[0][$word_key .":". $i][self::LIST_OFFSET]=
-                                NULL;
-                        }
-                        $first_common_flag = false;
-                    } else {
-                        $out_data[0][$word_key .":". $i][self::LIST_OFFSET] =
-                            NULL; // next in list is in same block
-                    }
-                }
-
-                $slice_cnt--;
-            }
-            if($min_common !== NULL) {
-                $out_data[
-                    0][$word_key .":". $min_common][self::POINT_BLOCK] = 0;
-                // this index needs to point to previous block with word
-            }
-
         }
-
-        $start_time = microtime();
-        $this->index->addObjectsPartition("offset", $partition,
-            $out_data, $block_data, "setOffsetPointers", false);
-
-        if(isset($this->diagnostics['ADD_OBJECTS_TIME'])) {
-            $this->diagnostics['ADD_OBJECTS_TIME'] +=
-                changeInMicrotime($start_time);
-        }
-
-
-        if($this->generation_info['NUM_WORDS']>$this->num_words_per_generation){
-            $index_filter_size = $this->index->filter_size;
-            $this->generation_info['ACTIVE']++;
-            $this->generation_info['NUM_WORDS'] = 0;
-            $this->index = new WebArchiveBundle(
-                $this->dir_name."/index".$this->generation_info['ACTIVE'],
-                $index_filter_size, $this->num_partitions_index);
-            file_put_contents(
-                $this->dir_name."/generation.txt",
-                serialize($this->generation_info));
-        }
-
-    }
+        return $this->current_shard;
+     }

     /**
-     * Adds the provided $word_key to the BloomFilter for the given partition
+     * Sets the current shard to be the $i th shard in the index bundle.
      *
-     * @param int $partition whose Bloom Filter we want to add the word_key to
-     * @param string $word_key the key to add
-     * @return bool whether the add was successful
+     * @param int $i which shard to set the current shard to be
      */
-    public function addPartitionIndexFilter($partition, $word_key)
-    {
-        if($this->initPartitionIndexFilter($partition) === false) {
+     public function setCurrentShard($i)
+     {
+        if(isset($this->generation_info['CURRENT']) &&
+            $i == $this->generation_info['CURRENT'] ||
+            $i > $this->generation_info['ACTIVE']) {
             return false;
+        } else {
+            $this->generation_info['CURRENT'] = $i;
+            return true;
         }
-        if(!$this->index_partition_filters[$partition]->contains($word_key)) {
-            $this->generation_info['NUM_WORDS']++;
-            $this->index_partition_filters[$partition]->add($word_key);
-        }
-
-        return true;
-    }
-
-    /**
-     * Initializes or constructs the Bloom filter assocaited with a partition
-     * @param int $partition index of desired partition
-     * @return bool whether the operation was successful
-     */
-    public function initPartitionIndexFilter($partition)
-    {
-        if(!isset($this->index_partition_filters[$partition])) {
-            if(file_exists($this->dir_name.
-                "/index_filters/partition$partition.ftr")) {
-                $this->index_partition_filters[$partition] =
-                    BloomFilterFile::load(
-                        $this->dir_name .
-                        "/index_filters/partition$partition.ftr");
-            } else {
-                $filter_size = $this->num_words_per_generation;
-                $this->index_partition_filters[$partition] =
-                    new BloomFilterFile(
-                        $this->dir_name .
-                        "/index_filters/partition$partition.ftr", $filter_size);
-            }
-        }
-        return true;
-    }
+     }

     /**
      * Gets the page out of the summaries WebArchiveBundle with the given
@@ -521,44 +308,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
         return $this->summaries->getPage($key, $offset);
     }

-    /**
-     * Returns a block of documents a word occur in. The doc block looked up
-     * is at a given offset into the word's partition WebArchive for a given
-     * generation. This is used when the word occurs more the
-     * COMMON_WORD_THRESHOLD many times in a generation
-     *
-     * @param string $word_key hash of word whose doc block we are looking up
-     * @param int $offset byte offset into word's partition WebArchive for the
-     *      supplied generation
-     * @param int $generation which generation to look up the doc block of
-     * @return array the desired doc block
-     */
-    public function getWordDocBlock($word_key, $offset, $generation = -1)
-    {
-        if($generation == -1) {
-            return $this->index->getPage($word_key, $offset);
-        } else {
-            $archive =
-                new WebArchiveBundle($this->dir_name."/index".$generation);
-            return $archive->getPage($word_key, $offset);
-        }
-    }

-    /**
-     * Gets a page using in WebArchive $partition of the word index
-     * using the provided byte $offset and using existing $file_handle
-     * if possible.
-     *
-     * @param int $partition which WebArchive to look in
-     * @param int $offset byte offset of page data
-     * @param resource $file_handle file handle resource of $partition archive
-     * @return array desired page
-     */
-    public function getPageByPartition($partition, $offset, $file_handle = NULL)
-    {
-        return $this->index->getPageByPartition(
-            $partition, $offset, $file_handle);
-    }

     /**
      * Adds the given summary to the summary exists filter bundle
@@ -588,125 +338,15 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      * Forces the data in the page exists filter bundle of summaries
-     * to be save to disk, forces each index partition summary to be saved
+     * to be saved to disk, forces the active shard to be saved, and the
+     * current filter in the index filter bundle to be saved
      */
     public function forceSave()
     {
         $this->summaries->forceSave();
-        for($i = 0; $i < $this->num_partitions_index; $i++) {
-            if(isset($this->index_partition_filters[$i]) &&
-                $this->index_partition_filters[$i] != NULL) {
-                $this->index_partition_filters[$i]->save();
-            }
-        }
+        $this->getActiveShard()->save();
     }

-    /**
-     * Computes statistics for the provided phrase_key.
-     * These include an estimate of the total number of documents it occurs in,
-     * as well as which generations it occurs in, and what are its info block
-     * looks like in the current generation
-     *
-     * @param string $phrase_key what to compute statistics for
-     * @param int $generation_index the current generation
-     * @param array $info_block info_block of the phrase_key (will look up
-     *      if not provided)
-     * @return array info for this $phrase_key
-     */
-    public function getPhraseIndexInfo(
-        $phrase_key, $generation_index = 0, $info_block = NULL)
-    {
-
-        $partition =
-            WebArchiveBundle::selectPartition(
-                $phrase_key, $this->num_partitions_index);
-        $info = array();
-        if($info_block == NULL) {
-
-            if(!$this->initPartitionIndexFilter($partition)) {
-                return NULL;
-            }
-            $filter = & $this->index_partition_filters[$partition];
-
-            if($filter == NULL || !$filter->contains($phrase_key)) {
-                return NULL;
-            }
-
-            $active_generation = $this->generation_info['ACTIVE'];
-
-            $min_generation = 0;
-            for($i = 0; $i <= $active_generation; $i++) {
-                if($filter->contains($phrase_key . $i)) {
-                    if($filter->contains("delete". $phrase_key . $i)) {
-                        $info['GENERATIONS'] = array();
-                        //truncate all previously seen
-                    } else {
-                        $info['GENERATIONS'][] = $i;
-                    }
-                }
-            }
-            $num_generations = count($info['GENERATIONS']);
-            if($num_generations == 0) {
-                return NULL;
-            }
-
-            $sample_size = min($num_generations, SAMPLE_GENERATIONS);
-            $sum_count = 0;
-            for($i = 0; $i < $sample_size; $i++) {
-                $block_info =
-                    $this->readPartitionInfoBlock(
-                        $partition, $info['GENERATIONS'][$i]);
-                $sum_count += $block_info[$phrase_key][self::COUNT];
-            }
-
-            $info['TOTAL_COUNT'] =
-                ceil(($sum_count*$num_generations)/$sample_size);
-                // this is an estimate
-        } else {
-            $info['TOTAL_COUNT'] = $info_block['TOTAL_COUNT'];
-            $info['GENERATIONS'] = $info_block['GENERATIONS'];
-        }
-
-        $block_info = $this->readPartitionInfoBlock(
-            $partition, $info['GENERATIONS'][$generation_index]);
-        $phrase_info = $block_info[$phrase_key];
-
-        $info['CURRENT_GENERATION_INDEX'] = $generation_index;
-
-        if(isset($phrase_info)) {
-            $phrase_info['CURRENT_GENERATION_INDEX'] =
-                $info['CURRENT_GENERATION_INDEX'];
-            $phrase_info['TOTAL_COUNT'] = $info['TOTAL_COUNT'];
-            $phrase_info['GENERATIONS'] = $info['GENERATIONS'];
-            return $phrase_info;
-        } else {
-            return NULL;
-        }
-
-    }
-
-    /**
-     * Sets the information associated with a word in the inverted index
-     *
-     * @param string $phrase_key
-     * @param array $info
-     */
-    public function setPhraseIndexInfo($phrase_key, $info)
-    {
-        $partition = WebArchiveBundle::selectPartition(
-            $phrase_key, $this->num_partitions_index);
-
-        $partition_block_data = $this->readPartitionInfoBlock($partition);
-
-        if($partition_block_data == NULL || !is_array($partition_block_data)) {
-            $partition_block_data = array();
-        }
-
-        $partition_block_data[$phrase_key] = $info;
-
-        $this->writePartitionInfoBlock($partition, $partition_block_data);
-
-    }

     /**
      * Computes the words which appear in the fewest or most documents
@@ -723,12 +363,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
         if(!is_array($word_keys) || count($word_keys) < 1) { return NULL;}

         foreach($word_keys as $word_key) {
-            $info = $this->getPhraseIndexInfo($word_key);
-            if(isset($info['TOTAL_COUNT'])) {
-                $words_array[$word_key] = $info['TOTAL_COUNT'];
-            } else {
+            $tmp = $this->getCurrentShard()->getWordInfo($word_key);
+            if($tmp === false) {
                 $words_array[$word_key] = 0;
-                return NULL;
+            } else {
+                $words_array[$word_key] = $tmp[2];
             }
         }

@@ -737,34 +376,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
         return array_slice($words_array, 0, $num);
     }

-    /**
-     * Reads the info block of $partition index WebArchive
-     *
-     * @param int $partition WebArchive to read from
-     * @return array data in its info block
-     */
-    public function readPartitionInfoBlock($partition, $generation = -1)
-    {
-        if($generation == -1) {
-            return $this->index->readPartitionInfoBlock($partition);
-        } else {
-            $archive = new WebArchiveBundle(
-                $this->dir_name."/index".$generation);
-            return $archive->readPartitionInfoBlock($partition);
-        }
-
-    }
-
-    /**
-     * Write $data into the info block of the $partition index WebArchive
-     *
-     * @param int $partition WebArchive to write into
-     * @param array $data what to write
-     */
-    public function writePartitionInfoBlock($partition, $data)
-    {
-        $this->index->writePartitionInfoBlock($partition, $data);
-    }

     /**
      * Gets the description, count of summaries, and number of partions of the
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 74adf2085..d5ace8fa8 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -78,6 +78,9 @@ class GroupIterator extends IndexBundleIterator
      */
     var $count_block;

+    /**
+     *
+     */
     var $current_block_hashes;

     /**
@@ -98,13 +101,11 @@ class GroupIterator extends IndexBundleIterator
      *
      * @param object $index_bundle_iterator to use as a source of documents
      *      to iterate over
-     * @param int $limit the first element to return from the list of docs
-     *      iterated over
+
      */
-    function __construct($index_bundle_iterator, $limit = 0)
+    function __construct($index_bundle_iterator)
     {
         $this->index_bundle_iterator = $index_bundle_iterator;
-        $this->limit = $limit;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
         $this->reset();
     }
@@ -116,26 +117,10 @@ class GroupIterator extends IndexBundleIterator
     function reset()
     {
         $this->index_bundle_iterator->reset();
-        $time = time();
         $this->grouped_keys = array();
             // -1 == never save, so file name not used using time to be safer
         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
-        $beneath_limit = true;
-        while($beneath_limit == true) {
-
-            $doc_block = $this->currentDocsWithWord();
-            if($doc_block == -1 || !is_array($doc_block)) {
-                $beneath_limit = false;
-                continue;
-            }
-            if($this->seen_docs + $this->count_block >= $this->limit) {
-                $beneath_limit = false;
-                continue;
-            }
-            $this->advance();
-        }
-
     }

     /**
@@ -148,6 +133,7 @@ class GroupIterator extends IndexBundleIterator
     {
         $pages =
             $this->index_bundle_iterator->currentDocsWithWord();
+
         $this->count_block_unfiltered = count($pages);
         if(!is_array($pages)) {
             return $pages;
@@ -159,28 +145,28 @@ class GroupIterator extends IndexBundleIterator
         if($this->count_block_unfiltered > 0 ) {
             $i = $this->seen_docs;
             foreach($pages as $doc_key => $doc_info) {
-                if(!is_array($doc_info)) {continue;}
+                if(!is_array($doc_info) ||
+                    isset($doc_info[self::DUPLICATE])) {continue;}
                 $doc_info['KEY'] = $doc_key;
-                $doc_key_parts = explode(":", $doc_key);
-                if(count($doc_key_parts) == 1) {
-                    $hash_url = $doc_key_parts[0];
+                if(strlen($doc_key) == 8) {
+                    $hash_url = $doc_key;
                     $doc_info['IS_PAGE'] = true;
                 } else {
+                    $doc_key_parts = array(
+                        substr($doc_key, 0, 8),substr($doc_key, 9, 8),
+                        substr($doc_key, 18, 8)
+                    );
                     $hash_url = $doc_key_parts[1];
                     $doc_info['IS_PAGE'] = false;
                 }
                 if(isset($this->grouped_keys[$hash_url])) {
-                    if( $i < $this->limit) {
-                        continue;
-                    } else {
-                        if(isset($pre_out_pages[$hash_url]) ) {
-                            $pre_out_pages[$hash_url][] = $doc_info;
-                            if($doc_info['IS_PAGE'] == true) {
-                                $pre_out_pages[$hash_url]['IS_PAGE'] = true;
-                            } else {
-                                $pre_out_pages[$hash_url]['HASH_INFO_URL'] =
-                                    $doc_key_parts[2];
-                            }
+                    if(isset($pre_out_pages[$hash_url]) ) {
+                        $pre_out_pages[$hash_url][] = $doc_info;
+                        if($doc_info['IS_PAGE'] == true) {
+                            $pre_out_pages[$hash_url]['IS_PAGE'] = true;
+                        } else {
+                            $pre_out_pages[$hash_url]['HASH_INFO_URL'] =
+                                $doc_key_parts[2];
                         }
                     }
                 } else {
@@ -202,17 +188,17 @@ class GroupIterator extends IndexBundleIterator
                     $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL'];
                     $word_iterator =
                          new WordIterator($hash_info_url,
-                            $this->getIndex(), 0);
+                            $this->getIndex(), true);
                     $doc_array = $word_iterator->currentDocsWithWord();
                     if(is_array($doc_array) && count($doc_array) == 1) {
                         $keys = array_keys($doc_array);
                         $key = $keys[0];
-                        if($doc_array[$key][self::DOC_RANK] > -1) {
+                        if(!isset($doc_array[$key][self::DUPLICATE]) ) {
                             $pre_out_pages[$hash_url][$key] = $doc_array[$key];
                             $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true;
                         } else {
                             /*
-                                Deduplication: idea is if the score < 0
+                                Deduplication:
                                 a deduplicate info: page was written, so
                                 we should ignore that group.
                             */
@@ -230,15 +216,6 @@ class GroupIterator extends IndexBundleIterator
             }
             $this->count_block = count($pre_out_pages);

-            if($this->seen_docs  <  $this->limit) {
-                $total_docs = $this->seen_docs + $this->count_block;
-                if($total_docs <  $this->limit) {
-                    $pre_out_pages =array();
-                } else {
-                    $pre_out_pages = array_slice($pre_out_pages,
-                        $this->limit - $this->seen_docs, NULL, true);
-                }
-            }
             $out_pages = array();
             foreach($pre_out_pages as $hash_url => $group_infos) {
                 foreach($group_infos as $doc_info) {
@@ -312,7 +289,7 @@ class GroupIterator extends IndexBundleIterator
                 $out_pages[$doc_key] = $doc_info;
                 foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) {
                     list($key, $summary_offset) = $offset_array;
-                    $index = $this->getIndex($key);
+                    $index = & $this->getIndex($key);
                     $page = $index->getPage(
                         $key, $summary_offset);
                     if(!isset($out_pages[$doc_key][self::SUMMARY])) {
@@ -358,9 +335,9 @@ class GroupIterator extends IndexBundleIterator

     /**
      * Returns the index associated with this iterator
-     * @return object the index
+     * @return &object the index
      */
-    function getIndex($key = NULL)
+    function &getIndex($key = NULL)
     {
         return $this->index_bundle_iterator->getIndex($key);
     }
diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php
index cd33b23c5..9bb7fec8c 100644
--- a/lib/index_bundle_iterators/index_bundle_iterator.php
+++ b/lib/index_bundle_iterators/index_bundle_iterator.php
@@ -63,13 +63,6 @@ abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants
      */
     var $seen_docs;

-    /**
-     * First document that should be returned
-     * amongst all of the documents associated with the
-     * iterator's $word_key
-     * @var int
-     */
-    var $limit;
     /**
      * The number of documents in the current block
      * @var int
@@ -104,7 +97,7 @@ abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants
      * Returns the index associated with this iterator
      * @return object the index
      */
-    abstract function getIndex($key = NULL);
+    abstract function &getIndex($key = NULL);

     /**
      * Hook function used by currentDocsWithWord to return the current block
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index a53ece59c..6bfa74121 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -102,13 +102,10 @@ class IntersectIterator extends IndexBundleIterator
      *
      * @param object $index_bundle_iterator to use as a source of documents
      *      to iterate over
-     * @param int $limit the first element to return from the list of docs
-     *      iterated over
      */
-    function __construct($index_bundle_iterators, $limit = 0)
+    function __construct($index_bundle_iterators)
     {
         $this->index_bundle_iterators = $index_bundle_iterators;
-        $this->limit = $limit;

         $this->num_iterators = count($index_bundle_iterators);
         $this->num_docs = -1;
@@ -138,19 +135,8 @@ class IntersectIterator extends IndexBundleIterator

         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
-        $beneath_limit = true;
-        while($beneath_limit == true) {
-            $doc_block = $this->currentDocsWithWord();
-            if($doc_block == -1 || !is_array($doc_block)) {
-                $beneath_limit = false;
-                continue;
-            }
-            if($this->seen_docs + $this->count_block >= $this->limit) {
-                $beneath_limit = false;
-                continue;
-            }
-            $this->advance();
-        }
+        $doc_block = $this->currentDocsWithWord();
+
     }

     /**
@@ -294,7 +280,7 @@ class IntersectIterator extends IndexBundleIterator
      * Returns the index associated with this iterator
      * @return object the index
      */
-    function getIndex($key = NULL)
+    function &getIndex($key = NULL)
     {
         return $this->index_bundle_iterators[0]->getIndex($key = NULL);
     }
diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php
index 36b98dddf..a844c3e9d 100644
--- a/lib/index_bundle_iterators/phrase_filter_iterator.php
+++ b/lib/index_bundle_iterators/phrase_filter_iterator.php
@@ -119,14 +119,11 @@ class PhraseFilterIterator extends IndexBundleIterator
      *      phrases
      * @param float $weight a quantity to multiply each score returned from
      *      this iterator with
-     * @param int $limit the first element to return from the list of docs
-     *      iterated over
      */
     function __construct($index_bundle_iterator, $restrict_phrases,
-        $disallow_phrases, $weight = 1, $limit = 0)
+        $disallow_phrases, $weight = 1)
     {
         $this->index_bundle_iterator = $index_bundle_iterator;
-        $this->limit = $limit;
         $this->restrict_phrases = $restrict_phrases;
         $this->disallow_phrases = $disallow_phrases;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
@@ -144,19 +141,7 @@ class PhraseFilterIterator extends IndexBundleIterator
         $this->index_bundle_iterator->reset();
         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
-        $beneath_limit = true;
-        while($beneath_limit == true) {
-            $doc_block = $this->currentDocsWithWord();
-            if($doc_block == -1 || !is_array($doc_block)) {
-                $beneath_limit = false;
-                continue;
-            }
-            if($this->seen_docs + $this->count_block > $this->limit) {
-                $beneath_limit = false;
-                continue;
-            }
-            $this->advance();
-        }
+        $doc_block = $this->currentDocsWithWord();
     }

     /**
@@ -224,15 +209,6 @@ class PhraseFilterIterator extends IndexBundleIterator
         }
         $this->count_block = count($pages);

-        if($this->seen_docs < $this->limit) {
-            $total_docs = $this->seen_docs + $this->count_block;
-            if($total_docs <  $this->limit) {
-                $pages =array();
-            } else {
-                $pages = array_slice($pages,
-                    $this->limit - $this->seen_docs, NULL, true);
-            }
-        }
         $this->summaries = $pages;
         $this->pages = array();
         foreach($pages as $doc_key => $doc_info) {
@@ -301,9 +277,9 @@ class PhraseFilterIterator extends IndexBundleIterator

     /**
      * Returns the index associated with this iterator
-     * @return object the index
+     * @return &object the index
      */
-    function getIndex($key = NULL)
+    function &getIndex($key = NULL)
     {
         return $this->index_bundle_iterator->getIndex($key = NULL);
     }
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
index bfbb3b5ac..4c79ec0a0 100644
--- a/lib/index_bundle_iterators/union_iterator.php
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -96,13 +96,10 @@ class UnionIterator extends IndexBundleIterator
      *
      * @param object $index_bundle_iterator to use as a source of documents
      *      to iterate over
-     * @param int $limit the first element to return from the list of docs
-     *      iterated over
      */
-    function __construct($index_bundle_iterators, $limit = 0)
+    function __construct($index_bundle_iterators)
     {
         $this->index_bundle_iterators = $index_bundle_iterators;
-        $this->limit = $limit;
         /*
             estimate number of results by sum of all iterator counts,
             then improve estimate as iterate
@@ -127,19 +124,7 @@ class UnionIterator extends IndexBundleIterator

         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
-        $beneath_limit = true;
-        while($beneath_limit == true) {
-            $doc_block = $this->currentDocsWithWord();
-            if($doc_block == -1 || !is_array($doc_block)) {
-                $beneath_limit = false;
-                continue;
-            }
-            if($this->seen_docs + $this->count_block >= $this->limit) {
-                $beneath_limit = false;
-                continue;
-            }
-            $this->advance();
-        }
+        $doc_block = $this->currentDocsWithWord();
     }

     /**
@@ -199,7 +184,6 @@ class UnionIterator extends IndexBundleIterator
             $keys = array_keys($this->pages);
         }
         $out_pages = array();
-        echo "hello".$this->pages[$key[0]]["ITERATOR"]."<br/>";
         foreach($keys as $doc_key) {
             if(!isset($this->pages[$doc_key]["ITERATOR"])) {
                 continue;
@@ -238,9 +222,9 @@ class UnionIterator extends IndexBundleIterator

     /**
      * Returns the index associated with this iterator
-     * @return object the index
+     * @return &object the index
      */
-    function getIndex($key = NULL)
+    function &getIndex($key = NULL)
     {
         if($key != NULL) {
             if($this->current_block_fresh == false) {
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 36a028246..a1e31358d 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -90,73 +90,61 @@ class WordIterator extends IndexBundleIterator
     var $index;

     /**
-     * If iterating through the linked-list portions of the documents
-     * the next byte offset in the WebArchive based linked-list
+     * The next byte offset in the IndexShard
      * @var int
      */
     var $next_offset;
-    /**
-     * Block number of the last block of docs
-     * @var int
-     */
-    var $last_pointed_block;
-    /**
-     * @var int
-     */
-    var $list_offset;

     /**
-     * Pointers to offsets for blocks containing docs with the given word
-     * for the current generation
-     * @var array
-     */
-    var $block_pointers;
-    /**
-     * Number of completely full blocks of documents for the current generation
+     * The current byte offset in the IndexShard
      * @var int
      */
-    var $num_full_blocks;
+    var $current_offset;
+
     /**
-     * Number of generations word appears in
+     * Last offset of word occurrence in the IndexShard
      * @var int
      */
-    var $num_generations;
+    var $last_offset;
+
     /**
-     * Used to store the contents of the last partially full block
+     * Keeps track of whether the word_iterator list is empty because the
+     * word does not appear in the index shard
      * @var int
      */
-    var $last_block;
-    /**
-     * the info block of the WebArchive that the word lives in
-     * @var object
-     */
-    var $info_block;
+    var $empty;
+
     /**
-     * Stores the number of the current block of documents we are at in the
-     * set of all blocks of BLOCK_SIZE many documents
+     *  Number of documents returned for each block (at most)
      * @var int
      */
-    var $current_pointer;
+    const RESULTS_PER_BLOCK = 2000;

     /**
      * Creates a word iterator with the given parameters.
      *
      * @param string $word_key hash of word or phrase to iterate docs of
-     * @param object $index the IndexArchiveBundle to use
+     * @param object &$index the IndexArchiveBundle to use
      * @param int $limit the first element to return from the list of docs
      *      iterated over
-     * @param object $info_block the info block of the WebArchive
-     *      associated with the word in the index. If NULL, then this will
-     *      loaded in WordIterator::reset()
+     * @param bool $raw if false, $word_key is in our variant of base64 encoding
      */
-    function __construct($word_key, $index, $limit = 0, $info_block = NULL)
+    function __construct($word_key, &$index, $raw = false)
     {
         $this->word_key = $word_key;
-        $this->index = $index;
-        $this->limit = $limit;
-        $this->info_block = $info_block;
+
+        $this->index = & $index;
         $this->current_block_fresh = false;
-        $this->reset();
+        $tmp = $index->getCurrentShard()->getWordInfo($word_key, $raw);
+        if ($tmp === false) {
+            $this->empty = true;
+        } else {
+            list($this->current_offset, $this->last_offset, $this->num_docs)
+                = $tmp;
+            $this->empty = false;
+
+            $this->reset();
+        }
     }

     /**
@@ -168,99 +156,9 @@ class WordIterator extends IndexBundleIterator
     {
         $this->count_block = 0;
         $this->seen_docs = 0;
-
-        $partition =
-            WebArchiveBundle::selectPartition($this->word_key,
-                $this->index->num_partitions_index);
-        if($this->info_block == NULL) {
-            $this->info_block =
-                $this->index->getPhraseIndexInfo($this->word_key);
-        }
-        if($this->info_block !== NULL) {
-            $this->num_generations = count($this->info_block['GENERATIONS']);
-            $count_till_generation = $this->info_block[self::COUNT];
-
-            while($this->limit >= $count_till_generation) {
-                $this->info_block['CURRENT_GENERATION_INDEX']++;
-                if($this->num_generations <=
-                    $this->info_block['CURRENT_GENERATION_INDEX']) {
-                    $this->num_docs = 0;
-                    $this->current_pointer = -1;
-                    return;
-                }
-                $info_block = $this->index->getPhraseIndexInfo(
-                    $this->word_key,
-                    $this->info_block['CURRENT_GENERATION_INDEX'],
-                    $this->info_block);
-                if($info_block !== NULL) {
-                    $this->info_block = $info_block;
-                }
-                $count_till_generation += $this->info_block[self::COUNT];
-            }
-            $this->seen_docs = $count_till_generation -
-                $this->info_block[self::COUNT];
-
-        }
-
-
-        $this->initGeneration();
-

     }

-    /**
-     * Sets up the iterator to iterate through the current generation.
-     *
-     * @return bool whether the initialization succeeds
-     */
-    function initGeneration()
-    {
-
-        if($this->info_block !== NULL) {
-            $info_block = $this->index->getPhraseIndexInfo(
-                $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'],
-                $this->info_block);
-            if($info_block === NULL) {
-                return false;
-            }
-            $this->info_block = & $info_block;
-            $this->num_docs = $info_block['TOTAL_COUNT'];
-            $this->num_docs_generation = $info_block[self::COUNT];
-
-            $this->current_pointer =
-                max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0);
-            $this->seen_docs += $this->current_pointer*BLOCK_SIZE;
-            $this->last_block = $info_block[self::END_BLOCK];
-            $this->num_full_blocks =
-                floor($this->num_docs_generation / BLOCK_SIZE);
-            if($this->num_docs_generation > COMMON_WORD_THRESHOLD) {
-                $this->last_pointed_block =
-                    floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE);
-            } else {
-                $this->last_pointed_block = $this->num_full_blocks;
-            }
-
-            for($i = 0; $i < $this->last_pointed_block; $i++) {
-                if(isset($info_block[$i])) {
-                    $this->block_pointers[$i] = $info_block[$i];
-                }
-            }
-
-            if($this->num_docs_generation > COMMON_WORD_THRESHOLD) {
-                if($info_block[self::LIST_OFFSET] === NULL) {
-                    $this->list_offset = NULL;
-                } else {
-                    $this->list_offset = $info_block[self::LIST_OFFSET];
-                }
-            }
-
-        } else {
-            $this->num_docs = 0;
-            $this->num_docs_generation = 0;
-            $this->current_pointer = -1;
-        }
-        return true;
-    }

     /**
      * Hook function used by currentDocsWithWord to return the current block
@@ -270,96 +168,13 @@ class WordIterator extends IndexBundleIterator
      */
     function findDocsWithWord()
     {
-        if($this->num_generations <=
-            $this->info_block['CURRENT_GENERATION_INDEX']) {
-            $this->pages = NULL;
-            return -1;
-        }
-        $generation =
-            $this->info_block['GENERATIONS'][
-                $this->info_block['CURRENT_GENERATION_INDEX']];
-        if($this->current_pointer >= 0) {
-            if($this->current_pointer == $this->num_full_blocks) {
-                $pages = $this->last_block;
-            } else if ($this->current_pointer >= $this->last_pointed_block) {
-                /* if there are more than COMMON_WORD_THRESHOLD many
-                   results and we're not at the last block yet
-                 */
-                if($this->list_offset === NULL) {
-                    $this->pages = NULL;
-                    return -1;
-                }
-                $offset = $this->list_offset;
-                $found = false;
-                do {
-                    /* the link list is actually backwards to the order we want
-                       For now, we cycle along the list from the last data
-                       stored until we find the block we want. This is slow
-                       but we are relying on the fact that each generation is
-                       not too big.
-                     */
-                    $doc_block = $this->index->getWordDocBlock($this->word_key,
-                        $offset, $generation);
-                    $word_keys = array_keys($doc_block);
-                    $found_key = NULL;
-                    foreach($word_keys as $word_key) {
-                        if(strstr($word_key, $this->word_key.":")) {
-                            $found_key = $word_key;
-                            if(isset($doc_block[
-                                $found_key][self::LIST_OFFSET])) {
-                                //only one list offset/docblock
-                                break;
-                            }
-                        }
-                    }
-                    if($found_key === NULL) {
-                        break;
-                    }
-                    if(isset($doc_block[
-                        $this->word_key.":".$this->current_pointer])) {
-                        $found = true;
-                        break;
-                    }
-                    $offset = $doc_block[$found_key][self::LIST_OFFSET];
-                } while($offset != NULL);
-                if($found != true) {
-                    $pages = array();
-                } else {
-                    $pages = & $doc_block[
-                        $this->word_key.":".$this->current_pointer];
-                }
-            } else {
-                //first COMMON_WORD_THRESHOLD many results fast
-                if(isset($this->block_pointers[$this->current_pointer])) {
-                    $doc_block = $this->index->getWordDocBlock($this->word_key,
-                        $this->block_pointers[$this->current_pointer],
-                        $generation);
-                    if(isset(
-                        $doc_block[$this->word_key.":".$this->current_pointer]
-                        )) {
-                        $pages = &
-                            $doc_block[
-                                $this->word_key.":".$this->current_pointer];
-                    } else {
-                        $pages = array();
-                    }
-                } else {
-                    $pages = array();
-                }
-            }
-
-            if($this->seen_docs < $this->limit) {
-                $diff_offset = $this->limit - $this->seen_docs;
-
-                $pages = array_slice($pages, $diff_offset);
-            }
-            $this->pages = & $pages;
-            $this->count_block = count($pages);
-            return $pages;
-        } else {
-            $this->pages = NULL;
+        if($this->current_offset > $this->last_offset || $this->empty) {
             return -1;
         }
+        $this->next_offset = $this->current_offset;
+        $results = $this->index->getCurrentShard()->getWordSlice(
+            $this->next_offset, $this->last_offset, self::RESULTS_PER_BLOCK);
+        return $results;
     }


@@ -368,30 +183,21 @@ class WordIterator extends IndexBundleIterator
      */
     function advance()
     {
-        if($this->current_pointer < 0) {return;}
-
         $this->advanceSeenDocs();
-
-        $this->current_pointer ++;
-        if($this->current_pointer > $this->num_full_blocks) {
-            $flag = false;
-            while ($this->info_block['CURRENT_GENERATION_INDEX'] <
-                $this->num_generations - 1 && !$flag) {
-                $this->info_block['CURRENT_GENERATION_INDEX']++;
-                $flag = $this->initGeneration();
-            }
-            if ($this->info_block['CURRENT_GENERATION_INDEX'] >=
-                $this->num_generations - 1) {
-                $this->current_pointer = - 1;
-            }
+        if($this->current_offset < $this->next_offset) {
+            $this->current_offset = $this->next_offset;
+        } else {
+            $this->current_offset = $this->last_offset + 1;
         }
+
+
     }

     /**
      * Returns the index associated with this iterator
-     * @return object the index
+     * @return &object the index
      */
-    function getIndex($key = NULL)
+    function &getIndex($key = NULL)
     {
         return $this->index;
     }
diff --git a/lib/index_shard.php b/lib/index_shard.php
index e903bc5b1..d26ba0d3c 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -34,6 +34,23 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
+ * Read in base class, if necessary
+ */
+require_once "persistent_structure.php";
+
+/**
+ * Load charCopy
+ */
+require_once "utility.php";
+
+/**
+ * Loads common constants for web crawling
+ */
+require_once  BASE_DIR.'/lib/crawl_constants.php';
+
+/**
+ * Data structure used to store one generation worth of the word document
+ * index (inverted index).
  *
  * @author Chris Pollett
  *
@@ -41,51 +58,474 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  * @subpackage library
  */

-class IndexShard extends PersistentStructure implements Serializable
+class IndexShard extends PersistentStructure implements CrawlConstants
 {
-    var $doc_ids;
+    /**
+     * Stores document id's and links to documents id's together with
+     * summary offset information, and number of words in the doc/link
+     * The format for a record is 8 bytes for a doc id, 1 bit is
+     * a link record flag, 31 bits for the summary offset (byte offset into
+     * web archive of the data for this document) and 4 bytes
+     * for number of words in doc. In the case of a link, there is
+     * an 8 byte link hash followed by the link record flag bit being on,
+     * followed by 31 bits for the summary offset, followed by  8 bytes for
+     * the hash of the url being pointed to by the link, followed by 8
+     * bytes for the hash of "info:url_pointed_to_by_link", followed by 4 bytes
+     * for the number of words in the link.
+     * @var string
+     */
+    var $doc_infos;
+    /**
+     *  Length of $doc_infos as a string
+     *  @var int
+     */
+    var $docids_len;
+    /**
+     * A string consisting of interwoven linked-lists. A given linked-list
+     * stores all the documents containing a given word. The format
+     * of a record in such a list consists of: 3 byte offset into $doc_infos
+     * for the document, followed by 1 byte recording the number of occurrences
+     * of the word in the document, followed by a four byte next pointer into
+     * the $word_docs string of the next record in the linked-list.
+     * @var string
+     */
     var $word_docs;
-    var $count_doc256;
+    /**
+     *  Length of $word_docs as a string
+     *  @var int
+     */
+    var $word_docs_len;
+    /**
+     * Used to store information about a word in this index shard.
+     * $words is an associative array, the key being an 8 byte word hash,
+     * the value being a 12 byte record. The first 4 bytes of this record
+     * being the offset to the start of the linked-list for that word in
+     * $word_docs, the next 4 bytes of this record being the last record
+     * for this word in the linked-list, and the last 4 bytes recording the
+     * number of records in this linked-list.
+     *
+     * @var array
+     */
+    var $words;
+
+    /**
+     * This is supposed to hold the number of documents that have been stored
+     * in earlier shards, prior to the current shard.
+     */
+    var $generation_offset;
+    /**
+     * Number of documents (not links) stored in this shard
+     * @var int
+     */
+    var $num_docs;
+    /**
+     * Number of links (not documents) stored in this shard
+     * @var int
+     */
+    var $num_link_docs;
+    /**
+     * Number of words stored in total in all documents in this shard
+     * @var int
+     */
+    var $len_all_docs;
+    /**
+     * Number of words stored in total in all links in this shard
+     * @var int
+     */
+    var $len_all_link_docs;

-    function __construct()
+    /**
+     * Used to keep track of whether a record in document infos is for a
+     * document or for a link
+     */
+    const COMPOSITE_ID_FLAG =  0x80000000;
+
+    /**
+     * Makes an index shard with the given file name and generation offset
+     *
+     * @param $fname filename to store the index shard with
+     * @param $generation_offset when returning documents from the shard
+     *      pretend there are this many earlier documents
+     */
+    function __construct($fname, $generation_offset = 0)
     {
+        parent::__construct($fname, -1);
+        $this->generation_offset = $generation_offset;
+        $this->word_docs = "";
+        $this->word_docs_len = 0;
+        $this->words = array();
+        $this->docids_len = 0;
+        $this->doc_infos = "";
+        $this->num_docs = 0;
+        $this->num_link_docs = 0;
+        $this->len_all_docs = 0;
+        $this->len_all_link_docs = 0;
     }
-
-    function addDocumentWords($doc_id, $word_id_array)
+
+    /**
+     * Add a new document to the index shard with the given summary offset.
+     * Associate with this document the supplied list of words and word counts.
+     * Finally, associate the given meta words with this document.
+     *
+     * @param string $doc_id id of document to insert
+     * @param int $summary_offset its offset into the web archive its data
+     *      is stored in
+     * @param array $word_counts (word => number of occurrences of word) pairs
+     *      for each word in the document
+     * @param array $meta_ids meta words to be associated with the document
+     *      an example meta word would be filetype:pdf for a PDF document.
+     */
+    function addDocumentWords($doc_id, $summary_offset, $word_counts,
+        $meta_ids)
     {
-        $this->doc_ids[] = $doc_id;
-
-        foreach($word_id_arr as $word_id => $relevance) {
-            $relevance = $relevance & 255;
-            $store = pack("N", $this->count_doc256 + $relevance);
-            $this->word_docs[$word_id] .= $store;
+        $is_doc = false;
+        $doc_len = 0;
+        $link_doc_len = 0;
+        if(strlen($doc_id) == 8) { //actual doc case
+            $this->doc_infos .= $doc_id . pack("N", $summary_offset);
+            $extra_offset = 0;
+            $this->num_docs++;
+            $is_doc = true;
+        } else { //link item
+            if(strlen($doc_id) !== 26) {
+                return false;
+            }
+            $id_parts = array(substr($doc_id, 0, 8),
+                substr($doc_id, 9, 8), substr($doc_id, 18, 8));
+            $this->num_link_docs++;
+            $this->doc_infos .= $id_parts[0] . pack("N",
+                ($summary_offset | self::COMPOSITE_ID_FLAG)) .
+                $id_parts[1] . $id_parts[2];
+            $extra_offset = 16;
+        }
+        foreach($meta_ids as $meta_id) {
+            $word_counts[$meta_id] = 0;
+        }
+        foreach($word_counts as $word => $occurrences) {
+            $word_id = crawlHash($word, true);
+            $occurrences = ($occurrences > 255 ) ? 255 : $occurrences & 255;
+            $store =  pack("N", ($this->docids_len << 4) + $occurrences);
+            $store .= pack("N", $this->word_docs_len);
+            if(!isset($this->words[$word_id])) {
+                $value = pack("N", $this->word_docs_len);
+                $value .= $value.pack("N", 1);
+            } else {
+                $value = $this->words[$word_id];
+                $first_string = substr($value, 0, 4);
+                $previous_string = substr($value, 4, 4);
+                $count_array = unpack("N", substr($value, 8, 4));
+                $count =  $count_array[1];
+                if($count == 0x7FFFFFFF) { continue; }
+                $count++;
+                $value = $first_string . pack("N", $this->word_docs_len) .
+                    pack("N", $count);
+                $tmp = unpack("N", $previous_string);
+                $previous = $tmp[1];
+                $previous_info = substr($this->word_docs, $previous, 8);
+                $previous_doc_occ = substr($previous_info, 0, 4);
+                $offset = $this->word_docs_len - $previous;
+                $previous_info = $previous_doc_occ.pack("N", $offset);
+                charCopy($previous_info, $this->word_docs, $previous, 8);
+            }
+            $this->words[$word_id] = $value;
+            $this->word_docs .= $store;
+            $this->word_docs_len += 8;
+            if($occurrences > 0) {
+                if($is_doc == true) {
+                    $doc_len += $occurrences;
+                } else {
+                    $link_doc_len += $occurrences;
+                }
+            }
         }

-        $this->count_doc256 += 256;
+        $this->len_all_docs += $doc_len;
+        $this->len_all_link_docs += $link_doc_len;
+        if($is_doc == true)  {
+            $this->doc_infos .= pack("N", $doc_len);
+        } else {
+            $this->doc_infos .= pack("N", $link_doc_len);
+        }
+        $this->docids_len += 16 + $extra_offset;
     }

-    function getWordSlice($word_id, $start, $len)
+    /**
+     * Returns the first offset, last offset, and number of documents the
+     * word occurred in for this shard. The first offset (similarly, the last
+     * offset) is the byte offset into the word_docs string of the first
+     * (last) record involving that word.
+     *
+     * @param string $word_id id of the word one wants to look up
+     * @param bool $raw whether the id is our version of base64 encoded or not
+     */
+    function getWordInfo($word_id, $raw = false)
     {
-        $result = array();
-        if(isset($word_docs[$word_id])) {
-            $docs_string = substr($word_docs[$word_id], $start << 2, $len <<2);
-            //check if got at least one item
-            if($docs_string !== false && ($doc_len = strlen($doc_string)) > 3) {
-                for($i = 0; $i < $doc_len; $i += 4) {
+
+        if($raw == false) {
+            //get rid of our modified base64 encoding
+            $hash = str_replace("_", "/", $word_id);
+            $hash = str_replace("-", "+" , $hash);
+            $hash .= "=";
+            $word_id = base64_decode($hash);
+
+        }
+
+        if(!isset($this->words[$word_id])) {
+            return false;
+        }
+        $first_string = substr($this->words[$word_id], 0, 4);
+        $tmp = unpack("N", $first_string);
+        $first_offset = $tmp[1];
+        $last_string = substr($this->words[$word_id], 4, 4);
+        $tmp = unpack("N", $last_string);
+        $last_offset = $tmp[1];
+        $count_string = substr($this->words[$word_id], 8, 4);
+        $tmp = unpack("N", $count_string);
+        $count = $tmp[1];
+
+
+        return array($first_offset, $last_offset, $count);
+
+    }
+
+    /**
+     * Returns documents using the word_docs string of records starting
+     * at the given offset and using its link-list of records. Traversal of
+     * the list stops if an offset larger than $last_offset is seen or
+     * $len many doc's have been returned. Since $next_offset is passed by
+     * reference the value of $next_offset will point to the next record in
+     * the list (if it exists) after the function is called.
+     *
+     * Each word_docs record read here is 8 bytes: a 4 byte big-endian
+     * integer whose low 8 bits are the occurrence count and whose remaining
+     * bits are the index of the document's row in doc_infos, followed by a
+     * 4 byte big-endian distance (in bytes) to the next record for the word.
+     * The returned array is keyed by doc id (for link items the id has two
+     * further 8 byte parts appended, separated by ":") and each value holds
+     * DOC_RANK and SUMMARY_OFFSET, plus either DUPLICATE or RELEVANCE and
+     * SCORE.
+     *
+     * @param int &$next_offset where to start in word docs
+     * @param int $last_offset offset at which to stop by
+     * @param int $len number of documents desired
+     * @return array desired list of doc's and their info
+     */
+    function getWordSlice(&$next_offset, $last_offset, $len)
+    {
+        $num_docs_so_far = 0;
+        $num_doc_or_links =  ($next_offset > 0) ? $last_offset/$next_offset
+            : 1; //very approx
+        $results = array();
+        do {
+            if($next_offset >= $this->word_docs_len) {break;}
+            $item = array();
+            // first 4 bytes of the record: doc index and occurrence count
+            $doc_string = substr($this->word_docs, $next_offset, 4);
+            $tmp = unpack("N", $doc_string);
+            $doc_int = $tmp[1];
+            $occurrences = $doc_int & 255;
+            $doc_index = ($doc_int >> 8);
+            // last 4 bytes of the record: distance to the word's next record
+            $next_string = substr($this->word_docs, $next_offset + 4, 4);
+            $tmp = unpack("N", $next_string);
+            // remembered so the loop can stop when the offset fails to advance
+            $old_next_offset = $next_offset;
+            $next_offset += $tmp[1];
+            // rank falls off with the log of the document's position
+            $doc_depth = log(10*(($doc_index +1) +
+                $this->generation_offset)*NUM_FETCHERS, 10);
+            $item[self::DOC_RANK] = number_format(11 -
+                $doc_depth, PRECISION);
+            // byte position of this document's row in doc_infos
+            $doc_loc = $doc_index << 4;
+            $doc_info_string = substr($this->doc_infos, $doc_loc,
+                12);
+            $doc_id = substr($doc_info_string, 0, 8);
+            $tmp = unpack("N", substr($doc_info_string, 8, 4));
+            $item[self::SUMMARY_OFFSET] = $tmp[1];
+            $is_doc = false;
+            $skip_stats = false;
+
+            if($item[self::SUMMARY_OFFSET] == 0x7FFFFFFF) {
+                // this summary offset value marks the row as a duplicate
+                $skip_stats = true;
+                $item[self::DUPLICATE] = true;
+            } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) {
+                //handles link item case
+                $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG;
+                $doc_loc += 12;
+                $doc_info_string = substr($this->doc_infos, $doc_loc, 16);
+                $doc_id .= ":".
+                    substr($doc_info_string, 0, 8).":".
+                    substr($doc_info_string, 8, 8);
+                $average_doc_len = ($this->num_link_docs != 0) ?
+                    $this->len_all_link_docs/$this->num_link_docs : 0;
+                $num_docs = $this->num_link_docs;
+            } else {
+                $is_doc = true;
+                $average_doc_len = $this->len_all_docs/$this->num_docs;
+                $num_docs = $this->num_docs;
+            }
+
+            if(!$skip_stats) {
+                // NOTE(review): in the link item case $doc_loc was already
+                // advanced by 12, so this reads row start + 24, which is the
+                // last 4 bytes of the second id part read above -- confirm
+                // against the code that writes doc_infos
+                $tmp = unpack("N",  substr($this->doc_infos, $doc_loc + 12, 4));
+                $doc_len = $tmp[1];
+                $doc_ratio = ($average_doc_len > 0) ?
+                    $doc_len/$average_doc_len : 0;
+                // occurrence score damped by length relative to the average
+                $pre_relevance = number_format(
+                        3 * $occurrences/
+                        ($occurrences + .5 + 1.5* $doc_ratio),
+                        PRECISION);
+                // scale the rough list length estimate by this item type's
+                // share of everything in the shard
+                $num_term_occurrences = $num_doc_or_links *
+                    $num_docs/($this->num_docs + $this->num_link_docs);
+                $IDF = ($num_docs - $num_term_occurrences + 0.5) /
+                    ($num_term_occurrences + 0.5);
+                $item[self::RELEVANCE] = $IDF * $pre_relevance;
+                $item[self::SCORE] = $item[self::DOC_RANK] +
+                    .1*$item[self::RELEVANCE];
+            }
+            $results[$doc_id] = $item;
+            $num_docs_so_far ++;
+
+        } while ($next_offset<= $last_offset && $num_docs_so_far < $len
+            && $next_offset > $old_next_offset);
+
+        return $results;
+    }
+
+
+    /**
+     * Looks up the word with the given raw id in this shard and returns up
+     * to $len of the documents it occurs in, starting from the first record
+     * of the word's linked list.
+     *
+     * @param string $word_id raw (not base64 encoded) key of the word to
+     *      look up documents for
+     * @param int $len number of documents desired back (from start of word
+     *      linked list).
+     * @return array desired list of doc's and their info; empty if the word
+     *      is not in this shard
+     */
+    function getWordSliceById($word_id, $len)
+    {
+        if(!isset($this->words[$word_id])) {
+            return array();
+        }
+        $word_info = $this->getWordInfo($word_id, true);
+        $start_offset = $word_info[0];
+        $stop_offset = $word_info[1];
+        return $this->getWordSlice($start_offset, $stop_offset, $len);
+    }
+
+    /**
+     * Adds the contents of the supplied $index_shard to the current index
+     * shard
+     *
+     * @param object &$index_shard the shard to append to the current shard
+     */
+    function appendIndexShard(&$index_shard)
+    {
+        $this->doc_infos .= $index_shard->doc_infos;
+        $this->word_docs .= $index_shard->word_docs;
+        $old_word_docs_len = $this->word_docs_len;
+        $this->word_docs_len += $index_shard->word_docs_len;
+        // update doc offsets in word_docs for newly added docs
+        for($i = $old_word_docs_len; $i < $this->word_docs_len; $i += 8) {
+            $doc_occurrences_string = substr($this->word_docs, $i, 4);
+            $tmp = unpack("N", $doc_occurrences_string);
+            $num = $tmp[1];
+            $num += ($this->docids_len << 4);
+            $doc_occurrences_string = pack("N", $num);
+            charCopy($doc_occurrences_string, $this->word_docs, $i, 4);
+        }
+
+        foreach($index_shard->words as $word_key => $word_docs_offset) {
+            $add_first_string = substr($word_docs_offset, 0, 4);
+            $tmp = unpack("N", $add_first_string);
+            $add_first_offset = $tmp[1];
+            $add_last_string = substr($word_docs_offset, 4, 4);
+            $tmp = unpack("N", $add_last_string);
+            $add_last_offset = $tmp[1];
+            $add_count = substr($word_docs_offset, 8, 4);
+            $tmp = unpack("N", $add_count);
+            $add_count = $tmp[1];
+            if(!isset($this->words[$word_key])) {
+                $new_word_docs_offset =
+                    pack("N", $old_word_docs_len + $add_first_offset).
+                    pack("N", $old_word_docs_len + $add_last_offset).
+                    pack("N", $add_count);
+            } else {
+                $value = $this->words[$word_key];
+                $first_string = substr($value, 0, 4);
+                $last_string = substr($value, 4, 4);
+                $tmp = unpack("N", $last_string);
+                $last_offset = $tmp[1];
+                $count_string = substr($value, 8, 4);
+                $tmp = unpack("N", $count_string);
+                $count = $tmp[1];
+                if($count == 0x7FFFFFFF) {
+
+                    continue;
                 }
+                $to_new_docs_offset = $add_first_offset
+                   + ($old_word_docs_len - $last_offset);
+                $to_new_docs_string = pack("N", $to_new_docs_offset);
+                charCopy($to_new_docs_string, $this->word_docs,
+                    $last_offset + 4, 4);
+                $new_word_docs_offset = $first_string .
+                    pack("N", $old_word_docs_len + $add_last_offset) .
+                    pack("N", $count + $add_count);
             }
+            $this->words[$word_key] = $new_word_docs_offset;
         }

-        return $result;
+        $this->docids_len += $index_shard->docids_len;
+        $this->num_docs += $index_shard->num_docs;
+        $this->num_link_docs += $index_shard->num_link_docs;
+        $this->len_all_docs += $index_shard->len_all_docs;
+        $this->len_all_link_docs += $index_shard->len_all_link_docs;
     }

-    function appendIndexShard($index_shard)
+    /**
+     * Changes the summary offsets associated with a set of doc_ids to new
+     * values. This is needed because the fetcher puts documents in a
+     * shard before sending them to a queue_server. It is on the queue_server
+     * however where documents are stored in the IndexArchiveBundle and
+     * summary offsets are obtained. Thus, the shard needs to be updated at
+     * that point.
+     *
+     * Walks $this->doc_infos row by row. A row is 16 bytes, or 32 bytes when
+     * its stored offset has COMPOSITE_ID_FLAG set (link item). Rows whose
+     * offset is 0x7FFFFFFF (duplicates) are skipped, and rows whose id has
+     * no entry in $docid_offsets keep their current offset.
+     *
+     * @param array $docid_offsets a set of doc_id offset pairs. Keys are
+     *      the 8 byte doc id, or for link items that id followed by ":" and
+     *      each of the row's two further 8 byte id parts
+     */
+    function changeDocumentOffsets($docid_offsets)
     {
+        $docids_len = $this->docids_len;
+
+        // $row_len is assigned inside the body; the increment runs after it
+        for($i = 0 ; $i < $docids_len; $i += $row_len) {
+            $row_len = 16;
+            $id = substr($this->doc_infos, $i, 8);
+            $tmp = unpack("N", substr($this->doc_infos, $i + 8, 4));
+            $offset = $tmp[1];
+            if($offset == 0x7FFFFFFF) {continue; }//ignore duplicates
+            $comp_flag = 0;
+            if(($offset & self::COMPOSITE_ID_FLAG) !== 0) {
+                //handle link item case
+                $row_len += 16;
+                $comp_flag = self::COMPOSITE_ID_FLAG;
+                $id .= ":".substr($this->doc_infos, $i + 12, 8) . ":" .
+                    substr($this->doc_infos, $i + 20, 8);
+            }
+            // a replacement offset gets the composite flag put back on it
+            $new_offset = (isset($docid_offsets[$id])) ?
+                pack("N", ($docid_offsets[$id] | $comp_flag)) :
+                pack("N", $offset);
+
+            charCopy($new_offset, $this->doc_infos, $i + 8, 4);
+        }
     }

-    function docCount()
+    /**
+     * Marks a set of urls as duplicates of urls previously seen
+     * To do this the url's doc_id is associated with a summary
+     * offset of value 0x7FFFFFFF, and its length is set to
+     * 0XFFFFFFFF
+     *
+     * @param array $doc_urls urls to mark as duplicates.
+     */
+    function markDuplicateDocs($doc_urls)
     {
-        return ($this->count_doc256 >> 8);
+        foreach($doc_urls as $duplicate) {
+            $doc_key = crawlHash($duplicate, true);
+            $this->doc_infos .= $doc_key . pack("N", 0x7FFFFFFF).
+                pack("N", 0xFFFFFFFF);
+            $word_key = crawlHash("info:".$duplicate, true);
+            $this->word_docs .= pack("N", ($this->docids_len<< 4)).pack("N",0);
+            $tmp = pack("N", $this->word_docs_len);
+            $this->words[$word_key] = $tmp.$tmp.pack("N", 0x7FFFFFFF);
+            $this->word_docs_len += 8;
+            $this->docids_len += 16;
+        }
+
     }

 }
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 9d2846fe7..1dc3ad5f8 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -80,7 +80,7 @@ class HtmlProcessor extends TextProcessor
                 if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
                     == 0 && count($summary[self::LINKS]) == 0) {
                     //maybe not html? treat as text still try to get urls
-                    $summary = parent::process($page, url);
+                    $summary = parent::process($page, $url);
                 }
             }
         }
diff --git a/lib/string_array.php b/lib/string_array.php
index 6d830f865..8c34fb028 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -38,6 +38,11 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  */
 require_once "persistent_structure.php";

+/**
+ * Load charCopy
+ */
+require_once "utility.php";
+
 /**
  * Memory efficient implementation of persistent arrays
  *
@@ -159,12 +164,10 @@ class StringArray extends PersistentStructure
         $data_size = $this->data_size;

         $start = $i * $data_size;
-        $end = $start + $data_size;

-        for($j = $start, $k = 0; $j < $end; $j++, $k++) {
-            $this->string_array[$j] = $data[$k];
-        }
+        charCopy($data, $this->string_array, $start, $data_size);
+
     }
-
+
 }
 ?>
diff --git a/lib/utility.php b/lib/utility.php
index 376f3cc08..1694054be 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -22,7 +22,7 @@
  *
  *  END LICENSE
  *
- * A library of log, hash, and time functions
+ * A library of string, log, hash, and time functions
  *
  * @author Chris Pollett chris@pollett.org
  * @package seek_quarry
@@ -35,6 +35,46 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

+/**
+ * Overwrites $length characters of $destination, beginning at position
+ * $start, with the first $length characters of $source. The destination
+ * is modified in place through the reference rather than rebuilt.
+ *
+ * @param string $source characters to copy from (read from index 0)
+ * @param string &$destination string written into
+ * @param int $start index in $destination of the first character replaced
+ * @param int $length number of characters to copy
+ */
+function charCopy($source, &$destination, $start, $length)
+{
+    for($copied = 0; $copied < $length; $copied++) {
+        $destination[$start + $copied] = $source[$copied];
+    }
+}
+
+/**
+ * Encodes a non-negative integer as a variable length byte string. The
+ * first byte holds the low seven bits with its high bit clear; each
+ * further byte has its high bit set and holds the next seven bits, least
+ * significant group first.
+ *
+ * @param int $pos_int integer to encode
+ * @return string the encoded bytes (always at least one)
+ */
+function vByteEncode($pos_int)
+{
+    $encoded = chr($pos_int & 127);
+    for($rest = $pos_int >> 7; $rest > 0; $rest >>= 7) {
+        $encoded .= chr(128 | ($rest & 127));
+    }
+    return $encoded;
+}
+
+/**
+ * Decodes one integer written by vByteEncode() out of $str, starting at
+ * byte $offset. The expected layout is a first byte holding the low seven
+ * bits with its high bit clear, followed by zero or more bytes that each
+ * have the high bit set and carry the next seven bits.
+ *
+ * @param string &$str string containing the encoded integer
+ * @param int &$offset position of the integer's first byte; on return it
+ *      is the position just after the integer's last byte
+ * @return int the decoded value
+ */
+function vByteDecode(&$str, &$offset)
+{
+    // mask the byte's value, not the character string itself
+    $pos_int = ord($str[$offset++]) & 127;
+    $shift = 7;
+    // parenthesised because > binds tighter than &; isset() stops the scan
+    // at the end of the string
+    while (isset($str[$offset]) && (ord($str[$offset]) & 128) > 0) {
+        $pos_int += (ord($str[$offset++]) & 127) << $shift;
+        $shift += 7;
+    }
+
+    return $pos_int;
+}
+
 /**
  *  Logs a message to a logfile or the screen
  *
diff --git a/models/phrase_model.php b/models/phrase_model.php
index b92584645..a604642e5 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -57,7 +57,7 @@ foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php")
 /**
  *
  * This is class is used to handle
- * db results for a given phrase search
+ * results for a given phrase search
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -151,7 +151,7 @@ class PhraseModel extends Model
         $index_archive = new IndexArchiveBundle(
             CRAWL_DIR.'/cache/'.$index_archive_name);
         $word_iterator =
-            new WordIterator(crawlHash("info:$url"), $index_archive, 0);
+            new WordIterator(crawlHash("info:$url"), $index_archive);
         $num_retrieved = 0;
         $pages = array();
         $summary_offset = NULL;
@@ -262,6 +262,7 @@ class PhraseModel extends Model
             $hashes = array_unique($hashes);
             $restrict_phrases = array_unique($restrict_phrases);
             $restrict_phrases = array_filter($restrict_phrases);
+            $index_archive->setCurrentShard(0);
             $words_array = $index_archive->getSelectiveWords($hashes, 10);

             if(is_array($words_array)) {
@@ -348,12 +349,56 @@ class PhraseModel extends Model
      *      INDEX_ARCHIVE -- an index_archive object to get results from
      * @param int $limit number of first document in order to return
      * @param int $num number of documents to return summaries of
-     * @param object $index_archive index archive to use to get summaries from
      * @return array document summaries
      */
     function getSummariesByHash($word_structs, $limit, $num)
     {

+        // Collect results generation by generation until enough have been
+        // gathered to sort and cut out the requested window, or until
+        // getGenerationSummariesByHash() stops returning an array.
+        $pages = array();
+        $generation = 0;
+        $to_retrieve = $limit + max(2*$num, 200);
+        $num_retrieved = 0;
+        while($num_retrieved < $to_retrieve) {
+            $gen_pages = $this->getGenerationSummariesByHash(
+                $word_structs, $to_retrieve, $generation);
+            if(!is_array($gen_pages)) { break; }
+            $num_retrieved += count($gen_pages);
+            // each generation comes back as a 0-based list; array_merge
+            // appends, whereas += would discard every entry whose integer
+            // key is already present from an earlier generation
+            $pages = array_merge($pages, $gen_pages);
+            $generation++;
+        }
+        uasort($pages, "scoreOrderCallback");
+        $pages = array_slice($pages, $limit, $num);
+        // TOTAL_ROWS is the number of rows actually retrieved across all
+        // generations visited
+        $results = array();
+        $results['TOTAL_ROWS'] = $num_retrieved;
+        $results['PAGES'] = & $pages;
+        return $results;
+    }
+
+
+    /**
+     * Gets doc summaries of documents containing given words and meeting the
+     * additional provided criteria in a given index shard generation
+     * @param array $word_structs an array of word_structs. Here a word_struct
+     *      is an associative array with at least the following fields
+     *      KEYS -- an array of word keys
+     *      RESTRICT_PHRASES -- an array of phrases the document must contain
+     *      DISALLOW_PHRASES -- an array of words the document must not contain
+     *      WEIGHT -- a weight to multiply scores returned from this iterator by
+     *      INDEX_ARCHIVE -- an index_archive object to get results from
+     * @param int $num number of documents to return summaries of
+     * @param int $generation the index of the generation to get summaries from
+     * @return array document summaries
+     */
+    function getGenerationSummariesByHash($word_structs,
+        $num, $generation)
+    {
+
         $iterators = array();
         foreach($word_structs as $word_struct) {
             if(!is_array($word_struct)) { continue;}
@@ -361,25 +406,29 @@ class PhraseModel extends Model
             $restrict_phrases = $word_struct["RESTRICT_PHRASES"];
             $disallow_phrases = $word_struct["DISALLOW_PHRASES"];
             $index_archive = $word_struct["INDEX_ARCHIVE"];
+            if($generation > $index_archive->generation_info['ACTIVE']) {
+                continue;
+            }
+            $index_archive->setCurrentShard($generation);
             $weight = $word_struct["WEIGHT"];
             $num_word_keys = count($word_keys);
             if($num_word_keys < 1) {continue;}

             for($i = 0; $i < $num_word_keys; $i++) {
                 $word_iterators[$i] =
-                    new WordIterator($word_keys[$i], $index_archive, 0);
+                    new WordIterator($word_keys[$i], $index_archive);
             }
             if($num_word_keys == 1) {
                 $base_iterator = $word_iterators[0];
             } else {
-                $base_iterator = new IntersectIterator($word_iterators, 0);
+                $base_iterator = new IntersectIterator($word_iterators);
             }
             if($restrict_phrases == NULL && $disallow_phrases == NULL &&
                 $weight == 1) {
                 $iterators[] = $base_iterator;
             } else {
                 $iterators[] = new PhraseFilterIterator($base_iterator,
-                    $restrict_phrases, $disallow_phrases, $weight, 0);
+                    $restrict_phrases, $disallow_phrases, $weight);
             }

         }
@@ -389,38 +438,26 @@ class PhraseModel extends Model
         } else if($num_iterators == 1) {
             $union_iterator = $iterators[0];
         } else {
-            $union_iterator = new UnionIterator($iterators, 0);
+            $union_iterator = new UnionIterator($iterators);
         }

-        $to_retrieve = $limit + max(2*$num, 200);
-        $group_iterator = new GroupIterator($union_iterator, 0);
+        $to_retrieve =  max(2*$num, 200);
+        $group_iterator = new GroupIterator($union_iterator);
         $num_retrieved = 0;
         $pages = array();
         while(is_array($next_docs = $group_iterator->nextDocsWithWord()) &&
-            $num_retrieved < $to_retrieve) {
+            $num_retrieved < $num) {
              foreach($next_docs as $doc_key => $doc_info) {
                  $summary = & $doc_info[CrawlConstants::SUMMARY];
                  unset($doc_info[CrawlConstants::SUMMARY]);
                  $pages[] = array_merge($doc_info, $summary);
                  $num_retrieved++;
                  if($num_retrieved >=  $to_retrieve) {
-
                      break 2;
                  }
              }
         }
-        uasort($pages, "scoreOrderCallback");
-        $pages = array_slice($pages, $limit, $num);
-        if($num_retrieved < $to_retrieve && $limit<=$group_iterator->num_docs) {
-            $results['TOTAL_ROWS'] = $num_retrieved;
-        } else {
-            $results['TOTAL_ROWS'] = max($group_iterator->num_docs,
-                $num_retrieved);
-            /*num_docs is only approximate, so if gives contradictory info
-              use $num_retrieved */
-        }
-        $results['PAGES'] = $pages;
-        return $results;
+        return $pages;
     }

 }
diff --git a/views/search_view.php b/views/search_view.php
index 71cadab4b..b0f5b1472 100755
--- a/views/search_view.php
+++ b/views/search_view.php
@@ -84,6 +84,8 @@ class SearchView extends View implements CrawlConstants
         <div class="searchbox">
         <form id="searchForm" method="get" action=''>
         <p>
+        <input type="hidden" name="YIOOP_TOKEN" value="<?php
+            e($data['YIOOP_TOKEN']); ?>" />
         <input type="hidden" name="its" value="<?php e($data['its']); ?>" />
         <input type="text" title="<?php e(tl('search_view_input_label')); ?>"
             id="search-name" name="q" value="<?php if(isset($data['QUERY'])) {

ViewGit