Rewrites WebArchiveBundle partitioning system to match index shard generations, a=cpollett

Chris Pollett [2010-10-21 19:Oct:st]
Rewrites WebArchiveBundle partitioning system to match index shard generations, a=cpollett
Filename
bin/fetcher.php
bin/queue_server.php
controllers/archive_controller.php
controllers/search_controller.php
lib/bst_array.php
lib/crawl_constants.php
lib/index_archive_bundle.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/index_bundle_iterator.php
lib/web_archive_bundle.php
models/crawl_model.php
models/phrase_model.php
tests/bst_array_test.php
views/search_view.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 0051b5caf..f6b6119e8 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -305,7 +305,7 @@ class Fetcher implements CrawlConstants
                     crawlLog("Old name: ".$this->web_archive->dir_name);
                 }
                 $this->web_archive = new WebArchiveBundle($tmp_base_name,
-                    URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS);
+                    URL_FILTER_SIZE);
                 $this->crawl_time = $info[self::CRAWL_TIME];
                 $this->sum_seen_title_length = 0;
                 $this->sum_seen_description_length = 0;
@@ -740,7 +740,7 @@ class Fetcher implements CrawlConstants
                 $i++;
             }
         } // end for
-        $stored_site_pages = $this->web_archive->addPages(self::HASH,
+        $cache_page_partition = $this->web_archive->addPages(
             self::OFFSET, $stored_site_pages);

         $num_pages = count($stored_site_pages);
@@ -750,6 +750,8 @@ class Fetcher implements CrawlConstants
             if(isset($stored_site_pages[$i][self::OFFSET])) {
                 $summarized_site_pages[$i][self::OFFSET] =
                     $stored_site_pages[$i][self::OFFSET];
+                $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
+                    $cache_page_partition;
             }
         }

diff --git a/bin/queue_server.php b/bin/queue_server.php
index e666d3fe9..5d2037eeb 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -416,8 +416,7 @@ class QueueServer implements CrawlConstants
             $this->index_archive = new IndexArchiveBundle(
                 CRAWL_DIR.'/cache/'.
                     self::index_data_base_name.$this->crawl_time,
-                URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS,
-                serialize($info));
+                URL_FILTER_SIZE, serialize($info));
         } else {
             $dir = CRAWL_DIR.'/cache/'.
                     self::index_data_base_name.$this->crawl_time;
@@ -589,11 +588,14 @@ class QueueServer implements CrawlConstants
             }
         }

-        if(isset($seen_sites)) {
-            $seen_sites =
-                $this->index_archive->addPages(
-                    self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites,
-                    $visited_urls_count);
+        if(isset($seen_sites) && isset($sites[self::INVERTED_INDEX])) {
+            $index_shard = & $sites[self::INVERTED_INDEX];
+            $generation =
+                $this->index_archive->initGenerationToAdd($index_shard);
+
+            $this->index_archive->addPages(
+                $generation, self::SUMMARY_OFFSET, $seen_sites,
+                $visited_urls_count);

             $summary_offsets = array();
             foreach($seen_sites as $site) {
@@ -604,16 +606,13 @@ class QueueServer implements CrawlConstants
                 " time: ".(changeInMicrotime($start_time)));
             $start_time = microtime();
             // added summary offset info to inverted index data
-            if(isset($sites[self::INVERTED_INDEX])) {
-                $index_shard = & $sites[self::INVERTED_INDEX];
-                $index_shard->changeDocumentOffsets($summary_offsets);
-            }
-        }
-        crawlLog("C (update shard offsets) memory usage".memory_get_usage() .
-            " time: ".(changeInMicrotime($start_time)));
-        $start_time = microtime();

-        if(isset($index_shard)) {
+            $index_shard->changeDocumentOffsets($summary_offsets);
+
+            crawlLog("C (update shard offsets) memory usage".memory_get_usage().
+                " time: ".(changeInMicrotime($start_time)));
+            $start_time = microtime();
+
             $this->index_archive->addIndexData($index_shard);
             $this->index_dirty = true;
         }
diff --git a/controllers/archive_controller.php b/controllers/archive_controller.php
index de00247aa..12a6a5063 100755
--- a/controllers/archive_controller.php
+++ b/controllers/archive_controller.php
@@ -98,8 +98,9 @@ class ArchiveController extends Controller implements CrawlConstants
     {
         $web_archive = new WebArchiveBundle(
             CRAWL_DIR.'/cache/'.self::archive_base_name.
-                $_REQUEST['crawl_time'], -1);
-        $page = $web_archive->getPage($_REQUEST['hash'], $_REQUEST['offset']);
+                $_REQUEST['crawl_time']);
+        $page = $web_archive->getPage($_REQUEST['offset'],
+            $_REQUEST['partition']);

         echo base64_encode(serialize($page));
     }
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index 716ab82ee..688722bea 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -172,8 +172,12 @@ class SearchController extends Controller implements CrawlConstants
                 if(isset($_REQUEST['so'])) {
                     $summary_offset = $this->clean($_REQUEST['so'], "int");
                 }
-                $this->cacheRequest($query, $arg, $summary_offset, $highlight,
-                    $index_time_stamp);
+                $generation = -1;
+                if(isset($_REQUEST['g'])) {
+                    $generation = $this->clean($_REQUEST['g'], "int");
+                }
+                $this->cacheRequest($query, $arg, $summary_offset, $generation,
+                    $highlight, $index_time_stamp);
             }
         }

@@ -226,12 +230,16 @@ class SearchController extends Controller implements CrawlConstants
                 if(isset($_REQUEST['so'])) {
                     $summary_offset = $this->clean($_REQUEST['so'], "int");
                 }
-                if($summary_offset === NULL) {
-                    $summary_offset =
-                        $this->phraseModel->lookupSummaryOffset($url);
+                $generation = -1;
+                if(isset($_REQUEST['g'])) {
+                    $generation = $this->clean($_REQUEST['g'], "int");
+                }
+                if($summary_offset === NULL || $generation == -1) {
+                    list($summary_offset, $generation) =
+                        $this->phraseModel->lookupSummaryOffsetGeneration($url);
                 }
-                $crawl_item = $this->crawlModel->getCrawlItem(
-                    crawlHash($url, true), $summary_offset);
+                $crawl_item = $this->crawlModel->getCrawlItem($summary_offset,
+                    $generation);

                 $top_phrases  =
                     $this->phraseModel->getTopPhrases($crawl_item, 3);
@@ -318,7 +326,7 @@ class SearchController extends Controller implements CrawlConstants
      * @param int $crawl_time the timestamp of the crawl to look up the cached
      *      page in
      */
-    function cacheRequest($query, $url, $summary_offset,
+    function cacheRequest($query, $url, $summary_offset = -1, $generation = -1,
         $highlight=true, $crawl_time = 0)
     {

@@ -328,24 +336,26 @@ class SearchController extends Controller implements CrawlConstants

         $this->phraseModel->index_name = $crawl_time;
         $this->crawlModel->index_name = $crawl_time;
-        if($summary_offset === NULL) {
-            $summary_offset = $this->phraseModel->lookupSummaryOffset($url);
+        if($summary_offset == -1 || $generation == -1) {
+            list($summary_offset, $generation) =
+                $this->phraseModel->lookupSummaryOffsetGeneration($url);
         }

-        if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url, true),
-            $summary_offset)) {
+        $data = array();
+        if(!$crawl_item = $this->crawlModel->getCrawlItem($summary_offset,
+            $generation)) {

             $this->displayView("nocache", $data);
             exit();
         }

-        $data = array();
+
         $machine = $crawl_item[self::MACHINE];
         $machine_uri = $crawl_item[self::MACHINE_URI];
         $page = $crawl_item[self::HASH];
         $offset = $crawl_item[self::OFFSET];
         $cache_item = $this->crawlModel->getCacheFile($machine,
-            $machine_uri, $page, $offset, $crawl_time);
+            $machine_uri, $generation, $offset,  $crawl_time);

         $cache_file = $cache_item[self::PAGE];

diff --git a/lib/bst_array.php b/lib/bst_array.php
deleted file mode 100644
index 9e41f3c5f..000000000
--- a/lib/bst_array.php
+++ /dev/null
@@ -1,149 +0,0 @@
-<?php
-/**
- *  SeekQuarry/Yioop --
- *  Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
- *
- *  LICENSE:
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @package seek_quarry
- * @subpackage library
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009, 2010
- * @filesource
- */
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- * Load charCopy
- */
-require_once "utility.php";
-
-/**
- *
- * @author Chris Pollett
- *
- * @package seek_quarry
- * @subpackage library
- */
-
-class BSTArray
-{
-    var $data;
-    var $data_len;
-    var $key_len;
-    var $value_len;
-    var $entry_len;
-    var $key_compare;
-
-    /**
-     *
-     */
-    function __construct($key_len, $value_len, $key_compare)
-    {
-        $this->data = "";
-        $this->data_len = 0;
-        $this->key_len = $key_len;
-        $this->value_len = $value_len;
-        $this->entry_len = $key_len + $value_len + 8;
-        $this->key_compare = $key_compare;
-    }
-
-    /**
-     *
-     */
-    function insertUpdate($key, $value)
-    {
-        $key_compare = $this->key_compare;
-        if($this->contains($key, $offset, $parent_offset))
-        {
-            list(, , $left_offset, $right_offset) = $this->readEntry($offset);
-            charCopy($key . $value . pack("N",$left_offset) .
-                pack("N", $right_offset),$this->data, $offset,$this->entry_len);
-        } else {
-            if($parent_offset != $offset) { // data already exists
-                list($parent_key, $parent_value, $parent_left_offset,
-                    $parent_right_offset) = $this->readEntry($parent_offset);
-                if($key_compare($parent_key, $key) < 0 ) {
-                    $parent_right_offset = $offset;
-                } else {
-                    $parent_left_offset = $offset;
-                }
-                $new_parent_entry =  $parent_key . $parent_value .
-                    pack("N", $parent_left_offset) .
-                    pack("N", $parent_right_offset);
-                charCopy( $new_parent_entry,
-                    $this->data, $parent_offset, $this->entry_len);
-            }
-            $this->data .= $key . $value . pack("H*", "7FFFFFFF7FFFFFFF");
-            $this->data_len += $this->entry_len;
-        }
-    }
-
-    /**
-     *
-     */
-    function contains($key, &$offset, &$parent_offset)
-    {
-        $offset = 0;
-        $parent_offset = 0;
-        $data_len = $this->data_len;
-        $entry_len = $this->entry_len;
-        $last_entry = $data_len - $entry_len;
-        $key_compare = $this->key_compare;
-        while($offset <= $last_entry ) {
-            list($cur_key, , $left_offset, $right_offset) =
-                $this->readEntry($offset);
-            $comparison = $key_compare($cur_key, $key);
-            if($comparison == 0) {
-                return true;
-            } else if ($comparison < 0) {
-                $parent_offset = $offset;
-                $offset = $right_offset;
-            } else {
-                $parent_offset = $offset;
-                $offset = $left_offset;
-            }
-        }
-
-        $offset = $data_len;
-        return false;
-    }
-
-    /**
-     *
-     */
-    function readEntry($offset)
-    {
-        $key = substr($this->data, $offset, $this->key_len);
-        $offset += $this->key_len;
-        $value = substr($this->data, $offset, $this->value_len);
-        $offset += $this->value_len;
-        $left_string = substr($this->data, $offset, 4);
-        $tmp = unpack("N", $left_string);
-        $left_offset = $tmp[1];
-        $offset += 4;
-        $right_string = substr($this->data, $offset, 4);
-        $tmp = unpack("N", $right_string);
-        $right_offset = $tmp[1];
-        return array($key, $value, $left_offset, $right_offset);
-    }
-}
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 87ff70ccc..49cb8accf 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -138,5 +138,6 @@ interface CrawlConstants

     const NEEDS_OFFSET_FLAG = 0x7FFFFFFE;
     const DUPLICATE_FLAG = 0x7FFFFFFF;
+    const CACHE_PAGE_PARTITION = 'aq';
 }
 ?>
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 2835f5e95..228a0872a 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -127,14 +127,9 @@ class IndexArchiveBundle implements CrawlConstants
      *      the WebArchiveBundles
-     * @param int $num_partitions_summaries number of WebArchive partitions
-     *      to use in the summmaries WebArchiveBundle
-     * @param int $num_partitions_index number of WebArchive partitions
-     *      to use in the index WebArchiveBundle
      * @param string $description a text name/serialized info about this
      * IndexArchiveBundle
      */
     public function __construct($dir_name, $filter_size = -1,
-        $num_partitions_summaries = NULL, $description = NULL,
-        $num_docs_per_generation = NUM_DOCS_PER_GENERATION)
+        $description = NULL, $num_docs_per_generation = NUM_DOCS_PER_GENERATION)
     {

         $this->dir_name = $dir_name;
@@ -147,7 +144,6 @@ class IndexArchiveBundle implements CrawlConstants
             $index_archive_exists = true;

         }
-
         if(file_exists($this->dir_name."/generation.txt")) {
             $this->generation_info = unserialize(
                 file_get_contents($this->dir_name."/generation.txt"));
@@ -157,11 +153,9 @@ class IndexArchiveBundle implements CrawlConstants
                 serialize($this->generation_info));
         }
         $this->summaries = new WebArchiveBundle($dir_name."/summaries",
-            $filter_size, $num_partitions_summaries, $description);
+            $filter_size, -1, $description);
         $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");

-        $this->num_partitions_summaries = $this->summaries->num_partitions;
-
         $this->description = $this->summaries->description;

         $this->num_docs_per_generation = $num_docs_per_generation;
@@ -170,36 +164,52 @@ class IndexArchiveBundle implements CrawlConstants

     /**
      * Add the array of $pages to the summaries WebArchiveBundle pages being
-     * stored in the partition according to the $key_field and the field used
+     * stored in the partition $generation and the field used
      * to store the resulting offsets given by $offset_field.
      *
-     * @param string $key_field field used to select partition
+     * @param int $generation generation (partition) in which to store the pages
      * @param string $offset_field field used to record offsets after storing
      * @param array &$pages data to store
      * @param int $visited_urls_count number to add to the count of visited urls
      *      (visited urls is a smaller number than the total count of objects
      *      stored in the index).
-     * @return array $pages adjusted with offset field
      */
-    public function addPages($key_field, $offset_field, $pages,
+    public function addPages($generation, $offset_field, &$pages,
         $visited_urls_count)
     {
-        $result = $this->summaries->addPages($key_field, $offset_field, $pages);
+        $this->summaries->setWritePartition($generation);
+        $this->summaries->addPages($offset_field, $pages);
         $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
-        return $result;
     }

     /**
      * Adds the provided mini inverted index data to the IndexArchiveBundle
      *
-     * @param array $index_data a mini inverted index of word_key=>doc data
+     * @param object &$index_shard a mini inverted index of word_key=>doc data
      *      to add to this IndexArchiveBundle
      */
-    public function addIndexData($index_shard)
+    public function addIndexData(&$index_shard)
     {

         crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
         $start_time = microtime();
+
+        $this->getActiveShard()->appendIndexShard($index_shard);
+        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
+          " Time: ".(changeInMicrotime($start_time)));
+    }
+
+    /**
+     * Determines based on its size, if index_shard should be added to
+     * the active generation or in a new generation should be started.
+     * If so, a new generation is started.
+     *
+     * @param object &$index_shard a mini inverted index of word_key=>doc data
+     * @return int the active generation after the check and possible change has
+     *      been performed
+     */
+    public function initGenerationToAdd(&$index_shard)
+    {
         $current_num_docs = $this->getActiveShard()->num_docs;
         $add_num_docs = $index_shard->num_docs;
         if($current_num_docs + $add_num_docs > $this->num_docs_per_generation){
@@ -217,9 +227,8 @@ class IndexArchiveBundle implements CrawlConstants
                 serialize($this->generation_info));
             crawlLog("Switch Shard time:".changeInMicrotime($switch_time));
         }
-        $this->getActiveShard()->appendIndexShard($index_shard);
-        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
-          " Time: ".(changeInMicrotime($start_time)));
+
+        return $this->generation_info['ACTIVE'];
     }

     /**
@@ -291,17 +300,19 @@ class IndexArchiveBundle implements CrawlConstants

     /**
      * Gets the page out of the summaries WebArchiveBundle with the given
-     * key and offset
+     * offset and generation
      *
-     * The $key determines the partition WebArchive, the $offset give the
-     * byte offset within that archive.
-     * @param string $key hash to use to look up WebArchive partition
      * @param int $offset byte offset in partition of desired page
+     * @param int $generation which generation WebArchive to look up in
+     *      defaults to the same number as the current shard
      * @return array desired page
      */
-    public function getPage($key, $offset)
+    public function getPage($offset, $generation = -1)
     {
-        return $this->summaries->getPage($key, $offset);
+        if($generation == -1 ) {
+            $generation = $this->generation_info['CURRENT'];
+        }
+        return $this->summaries->getPage($offset, $generation);
     }


diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 122e5d016..e574d7276 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -285,8 +285,7 @@ class GroupIterator extends IndexBundleIterator
                 foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) {
                     list($key, $summary_offset) = $offset_array;
                     $index = & $this->getIndex($key);
-                    $page = $index->getPage(
-                        $key, $summary_offset);
+                    $page = $index->getPage($summary_offset);
                     if(!isset($out_pages[$doc_key][self::SUMMARY])) {
                         $out_pages[$doc_key][self::SUMMARY] = $page;
                     } else if (isset($page[self::DESCRIPTION])) {
diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php
index dbef9bc3b..ed5ef90c9 100644
--- a/lib/index_bundle_iterators/index_bundle_iterator.php
+++ b/lib/index_bundle_iterators/index_bundle_iterator.php
@@ -154,8 +154,7 @@ abstract class IndexBundleIterator implements CrawlConstants
                 $doc_info = $pages[$doc_key];
             }
             if(isset($doc_info[self::SUMMARY_OFFSET])) {
-                $page = $index->getPage(
-                    $doc_key, $doc_info[self::SUMMARY_OFFSET]);
+                $page = $index->getPage($doc_info[self::SUMMARY_OFFSET]);
                 $out_pages[$doc_key] = $doc_info;
                 $out_pages[$doc_key][self::SUMMARY] = $page;
             }
diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php
index 25a2df175..76f6e4dd3 100755
--- a/lib/web_archive_bundle.php
+++ b/lib/web_archive_bundle.php
@@ -91,15 +91,15 @@ class WebArchiveBundle
      */
     var $page_exists_filter_bundle;
     /**
-     * Number of WebArchives in the WebArchiveBundle
+     * Total number of page objects stored by this WebArchiveBundle
      * @var int
      */
-    var $num_partitions;
+    var $count;
     /**
-     * Total number of page objects stored by this WebArchiveBundle
+     * The index of the partition to which new documents will be added
      * @var int
      */
-    var $count;
+    var $write_partition;
     /**
      * A short text name for this WebArchiveBundle
      * @var string
@@ -119,24 +119,27 @@ class WebArchiveBundle
      * @param string $dir_name folder name of the bundle
      * @param int $filter_size number of items that can be stored in
      *      a given BloomFilterFile in the $page_exists_filter_bundle
-     * @param int $num_partitions number of WebArchive's in this bundle
+     * @param int $num_docs_per_partition number of documents before the
+     *      web archive is changed
      * @param string $description a short text name/description of this
      *      WebArchiveBundle
      * @param string $compressor the Compressor object used to
      *      compress/uncompress data stored in the bundle
      */
     function __construct($dir_name, $filter_size = -1,
-        $num_partitions = NULL, $description = NULL,
+        $num_docs_per_partition = NUM_DOCS_PER_GENERATION, $description = NULL,
         $compressor = "GzipCompressor")
     {
         //filter size = -1 used by web server to not get all partitions created

         $this->dir_name = $dir_name;
         $this->filter_size = $filter_size;
+        $this->num_docs_per_partition = $num_docs_per_partition;
         $this->compressor = $compressor;
+        $this->write_partition = 0;

         $read_only_archive = false;
-        if($num_partitions == NULL) {
+        if($filter_size == -1) {
             $read_only_archive = true;
         }

@@ -151,16 +154,18 @@ class WebArchiveBundle
                 file_get_contents($this->dir_name."/description.txt"));
         }

-        $this->num_partitions = $num_partitions;
-        if(isset($info['NUM_PARTITIONS'])) {
-            $this->num_partitions = $info['NUM_PARTITIONS'];
+        if(isset($info['NUM_DOCS_PER_PARTITION'])) {
+            $this->num_docs_per_partition = $info['NUM_DOCS_PER_PARTITION'];
         }

         $this->count = 0;
         if(isset($info['COUNT'])) {
             $this->count = $info['COUNT'];
         }
-
+
+        if(isset($info['WRITE_PARTITION'])) {
+            $this->write_partition = $info['WRITE_PARTITION'];
+        }
         if(isset($info['DESCRIPTION']) ) {
             $this->description = $info['DESCRIPTION'];
         } else {
@@ -171,8 +176,9 @@ class WebArchiveBundle
         }

         $info['DESCRIPTION'] = $this->description;
-        $info['NUM_PARTITIONS'] = $this->num_partitions;
+        $info['NUM_DOCS_PER_PARTITION'] = $this->num_docs_per_partition;
         $info['COUNT'] = $this->count;
+        $info['WRITE_PARTITION'] = $this->write_partition;
         if(!$read_only_archive) {
             file_put_contents(
                 $this->dir_name."/description.txt", serialize($info));
@@ -191,80 +197,61 @@ class WebArchiveBundle

     /**
      * Add the array of $pages to the WebArchiveBundle pages being stored in
-     * the partition according to the $key_field and the field used to store
+     * the partition according to write partition and the field used to store
      * the resulting offsets given by $offset_field.
      *
-     * @param string $key_field field used to select partition
      * @param string $offset_field field used to record offsets after storing
      * @param array &$pages data to store
-     * @return array $pages adjusted with offset field
+     * @return int the write_partition the pages were stored in
      */
-    function addPages($key_field, $offset_field, &$pages)
+    function addPages($offset_field, &$pages)
     {
-        $partition_queue = array();
-        for($i = 0; $i < $this->num_partitions; $i++) {
-            $partition_queue[$i] = array();
-        }

         $num_pages = count($pages);
-        for($i = 0; $i < $num_pages; $i++) {
-            //we are doing this to preserve the order of the returned array
-            $pages[$i]['TMP_INDEX'] = $i;
-        }
-
-        foreach($pages as $page) {
-            if(isset($page[$key_field])) {
-                $this->count++;

-                $index = WebArchiveBundle::selectPartition(
-                    $page[$key_field], $this->num_partitions);
-
-                $partition_queue[$index][] =  $page;
-            }
+        if($this->num_docs_per_partition > 0 &&
+            $num_pages > $this->num_docs_per_partition) {
+            crawlLog("ERROR! At most ".$this->num_docs_per_partition.
+                " many pages can be added in one go!");
+            exit();
         }

-        $pages_with_offsets = array();
-        for($i = 0; $i < $this->num_partitions; $i++) {
-            $pages_with_offsets = array_merge($pages_with_offsets,
-                $this->addObjectsPartition(
-                    $offset_field, $i, $partition_queue[$i]));
+        $partition = $this->getPartition($this->write_partition);
+        $part_count = $partition->count;
+        if($this->num_docs_per_partition > 0 &&
+            $num_pages + $part_count > $this->num_docs_per_partition) {
+            $this->setWritePartition($this->write_partition + 1);
+            $partition = $this->getPartition($this->write_partition);
         }

-        foreach($pages_with_offsets as $off_page) {
-            $pages[$off_page['TMP_INDEX']][$offset_field] =
-                $off_page[$offset_field];
-            unset($pages[$off_page['TMP_INDEX']]['TMP_INDEX'] );
-        }
-        return $pages;
+        $this->addCount($num_pages); //only adds to count on disk
+        $this->count += $num_pages;
+
+        $partition->addObjects($offset_field, $pages, NULL, NULL, false);
+
+        return $this->write_partition;
     }

     /**
-     * Gets the page out of the WebArchiveBundle with the given key and offset
-     *
-     * The $key determines the partition WebArchive, the $offset give the
-     * byte offset within that archive.
-     * @param string $key hash to use to look up WebArchive partition
-     * @param int $offset byte offset in partition of desired page
-     * @return array desired page
+     * Sets the index of the write partition to $i and creates the
+     * corresponding web archive if it does not already exist.
      */
-    function getPage($key, $offset)
+    function setWritePartition($i)
     {
-        $partition =
-            WebArchiveBundle::selectPartition($key, $this->num_partitions);
-
-        return $this->getPageByPartition($partition, $offset);
+        $this->write_partition = $i;
+        $this->getPartition($this->write_partition);
     }

     /**
      * Gets a page using in WebArchive $partition using the provided byte
      * $offset and using existing $file_handle if possible.
      *
-     * @param int $partition which WebArchive to look in
      * @param int $offset byte offset of page data
+     * @param int $partition which WebArchive to look in
      * @param resource $file_handle file handle resource of $partition archive
      * @return array desired page
      */
-    function getPageByPartition($partition, $offset, $file_handle = NULL)
+    function getPage($offset, $partition, $file_handle = NULL)
     {
         $page_array =
             $this->getPartition($partition)->getObjects(
@@ -294,51 +281,6 @@ class WebArchiveBundle
         }
     }

-    /**
-     * Adds a list of objects to a given WebArchive partition
-     *
-     * @param string $offset_field field used to store offsets after the
-     *      addition
-     * @param int $partition WebArchive index to store data into
-     * @param array &$objects objects to store
-     * @param array $data info header data to write
-     * @param string $callback function name of function to call as each
-     *      object is stored. Can be used to save offset into $data
-     * @param bool $return_flag whether to return modified $objects or not
-     * @return mixed adjusted objects or void
-     */
-    function addObjectsPartition($offset_field, $partition,
-        &$objects, $data = NULL, $callback = NULL, $return_flag = true)
-    {
-        $num_objects = count($objects);
-        $this->addCount($num_objects);
-
-        return $this->getPartition($partition)->addObjects(
-            $offset_field, $objects, $data, $callback, $return_flag);
-    }
-
-    /**
-     * Reads the info block of $partition WebArchive
-     *
-     * @param int $partition WebArchive to read from
-     * @return array data in its info block
-     */
-    function readPartitionInfoBlock($partition)
-    {
-        return $this->getPartition($partition)->readInfoBlock();
-    }
-
-    /**
-     * Write $data into the info block of the $partition WebArchive
-     *
-     * @param int $partition WebArchive to write into
-     * @param array $data what to write
-     */
-    function writePartitionInfoBlock($partition, &$data)
-    {
-        $this->getPartition($partition)->writeInfoBlock(NULL, $data);
-    }
-
     /**
      * Looks at the $key_field key of elements of pages and computes an array
      * consisting of $key_field values which are not in
@@ -387,6 +329,8 @@ class WebArchiveBundle
         }
     }

+
+
     /**
      * Gets an object encapsulating the $index the WebArchive partition in
      * this bundle.
@@ -412,7 +356,6 @@ class WebArchiveBundle
                 chmod($this->dir_name."/web_archive_".$index, 0777);
             }
         }
-
         return $this->partition[$index];
     }

@@ -466,7 +409,7 @@ class WebArchiveBundle
             $info['DESCRIPTION'] =
                 "Archive does not exist OR Archive description file not found";
             $info['COUNT'] = 0;
-            $info['NUM_PARTITIONS'] = 0;
+            $info['NUM_DOCS_PER_PARTITION'] = -1;
             return $info;
         }

@@ -476,26 +419,5 @@ class WebArchiveBundle

     }

-    /**
-     * Hashes $value to a WebArchive partition  it should be read/written to,
-     * if a bundle has $num_partitions partitions.
-     *
-     * @param string $value item to hash
-     * @param int $num_partitions number of partitions
-     * @return int which partition $value should be written to/read from
-     */
-    static function selectPartition($value, $num_partitions)
-    {
-
-        $hash = substr(md5($value, true), 0, 4);
-        $int_array = unpack("N", $hash);
-        $seed = $int_array[1];
-
-        mt_srand($seed);
-        $index = mt_rand(0, $num_partitions - 1);
-
-        return $index;
-
-    }
 }
 ?>
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 8eedcf2e2..f01cec47f 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -75,21 +75,22 @@ class CrawlModel extends Model implements CrawlConstants


     /**
-     * Get a summary of a document by it document id (a string hash value)
-     * and its offset
+     * Get a summary of a document by the generation it is in
+     * and its offset into the corresponding WebArchive.
      *
-     * @param string $ukey document id hash string
-     * @param int $summary_offset offset into a partition in a WebArchiveBundle
+     * @param int $summary_offset offset in $generation WebArchive
+     * @param int $generation the index of the WebArchive in the
+     *      IndexArchiveBundle to find the item in.
      * @return array summary data of the matching document
      */
-    function getCrawlItem($ukey, $summary_offset)
+    function getCrawlItem($summary_offset, $generation)
     {
         $index_archive_name = self::index_data_base_name . $this->index_name;

         $index_archive =
             new IndexArchiveBundle(CRAWL_DIR.'/cache/'.$index_archive_name);

-        $summary = $index_archive->getPage($ukey, $summary_offset);
+        $summary = $index_archive->getPage($summary_offset, $generation);

         return $summary;
     }
@@ -108,27 +109,29 @@ class CrawlModel extends Model implements CrawlConstants
      *      cached page lives on
      * @param string $machine_uri the path from document root on $machine where
      *      the yioop scripts live
-     * @param string $hash the hash that was used to represent the page in the
-     *       WebArchiveBundle
+     * @param int $partition the partition in the WebArchiveBundle the page is
+     *       in
      * @param int $offset the offset in bytes into the WebArchive partition in
      *      the WebArchiveBundle at which the cached page lives.
      * @param string $crawl_time the timestamp of the crawl the cache page is
      *      from
      * @return array page data of the cached page
      */
-    function getCacheFile($machine, $machine_uri, $hash, $offset, $crawl_time)
+    function getCacheFile($machine, $machine_uri, $partition,
+        $offset, $crawl_time)
     {
         $time = time();
         $session = md5($time . AUTH_KEY);
-        if($machine == '::1') {
-            $machine = "localhost";
+        if($machine == '::1') { //IPv6 :(
+            $machine = "[::1]/";
             //used if the fetching and queue serving were on the same machine
         }

         $request= "http://$machine$machine_uri?c=archive&a=cache&time=$time".
-            "&session=$session&hash=$hash&offset=$offset".
+            "&session=$session&partition=$partition&offset=$offset".
             "&crawl_time=$crawl_time";
-        $page = @unserialize(base64_decode(FetchUrl::getPage($request)));
+        $tmp = FetchUrl::getPage($request);
+        $page = @unserialize(base64_decode($tmp));
         $page['REQUEST'] = $request;

         return $page;
@@ -196,7 +199,9 @@ class CrawlModel extends Model implements CrawlConstants
                     isset($info['VISITED_URLS_COUNT']) ?
                     $info['VISITED_URLS_COUNT'] : 0;
                 $crawl['COUNT'] = $info['COUNT'];
-                $crawl['NUM_PARTITIONS'] = $info['NUM_PARTITIONS'];
+                $crawl['NUM_DOCS_PER_PARTITION'] =
+                    $info['NUM_DOCS_PER_PARTITION'];
+                $crawl['WRITE_PARTITION'] = $info['WRITE_PARTITION'];
                 $list[] = $crawl;
             }
         }
diff --git a/models/phrase_model.php b/models/phrase_model.php
index c822caba3..151aef13c 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -139,33 +139,38 @@ class PhraseModel extends Model

     /**
      * Determines the offset into the summaries WebArchiveBundle of the
-     * provided url so that it can be retrieved. This relies on the
-     * fact that the info:url meta word has been stored.
+     * provided url so that the info:url summary can be retrieved.
+     * This assumes, of course, that the info:url meta word has been stored.
      *
      * @param string $url what to lookup
-     * @return int offset into the web archive bundle
+     * @return array (offset, generation) into the web archive bundle
      */
-    function lookupSummaryOffset($url)
+    function lookupSummaryOffsetGeneration($url)
     {
         $index_archive_name = self::index_data_base_name . $this->index_name;
         $index_archive = new IndexArchiveBundle(
             CRAWL_DIR.'/cache/'.$index_archive_name);
-        $word_iterator =
-            new WordIterator(crawlHash("info:$url"), $index_archive);
         $num_retrieved = 0;
         $pages = array();
         $summary_offset = NULL;
-        while(is_array($next_docs = $word_iterator->nextDocsWithWord()) &&
-            $num_retrieved < 1) {
-             foreach($next_docs as $doc_key => $doc_info) {
-                 $summary_offset = & $doc_info[CrawlConstants::SUMMARY_OFFSET];
-                 $num_retrieved++;
-                 if($num_retrieved >=  1) {
-                     break 2;
+        $num_generations = $index_archive->generation_info['ACTIVE'];
+        for($i = 0; $i <= $num_generations && $num_retrieved < 1; $i++) {
+            $index_archive->setCurrentShard($i);
+            $word_iterator =
+                new WordIterator(crawlHash("info:$url"), $index_archive);
+            while(is_array($next_docs = $word_iterator->nextDocsWithWord()) &&
+                $num_retrieved < 1) {
+                 foreach($next_docs as $doc_key => $doc_info) {
+                     $summary_offset = &
+                        $doc_info[CrawlConstants::SUMMARY_OFFSET];
+                     $num_retrieved++;
+                     if($num_retrieved >=  1) {
+                         break 3;
+                     }
                  }
-             }
+            }
         }
-        return $summary_offset;
+        return array($summary_offset, $i);
     }

     /**
@@ -419,6 +424,7 @@ class PhraseModel extends Model
             if($generation > $index_archive->generation_info['ACTIVE']) {
                 continue;
             }
+
             $index_archive->setCurrentShard($generation);
             $weight = $word_struct["WEIGHT"];
             $num_word_keys = count($word_keys);
diff --git a/tests/bst_array_test.php b/tests/bst_array_test.php
deleted file mode 100644
index 5c545e4d2..000000000
--- a/tests/bst_array_test.php
+++ /dev/null
@@ -1,103 +0,0 @@
-<?php
-/**
- *  SeekQuarry/Yioop --
- *  Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
- *
- *  LICENSE:
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @package seek_quarry
- * @subpackage test
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009, 2010
- * @filesource
- */
-
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- *  Load the string_array library we'll be testing
- */
-require_once BASE_DIR."/lib/bst_array.php";
-
-/**
- *  Used to test that the BSTArray class properly stores/retrieves values,
- *
- *  @author Chris Pollett
- *  @package seek_quarry
- *  @subpackage test
- */
-class BSTArrayTest extends UnitTest
-{
-    /**
-     * We'll use two different tables one more representative of how the table
-     * is going to be used by the web_queue_bundle, the other small enough that
-     * we can manually figure out what the result should be
-     */
-    public function setUp()
-    {
-        $this->test_objects['BST'] = new BSTArray(1, 1, "strcmp");
-    }
-
-    /**
-     */
-    public function tearDown()
-    {
-        unset($this->test_objects['BST']);
-    }
-
-    /**
-     * Check if can put objects into BST array and retrieve them
-     */
-    public function insertTestCase()
-    {
-        $this->test_objects['BST']->insertUpdate(chr(65), chr(66));
-        $flag = $this->test_objects['BST']->contains(chr(65), $offset, $parent);
-        $this->assertTrue($flag, "BST contains what was just inserted");
-        $this->test_objects['BST']->insertUpdate(chr(67), chr(68));
-        $flag = $this->test_objects['BST']->contains(chr(67), $offset, $parent);
-        $this->assertTrue($flag, "BST contains second insert");
-        $this->test_objects['BST']->insertUpdate(chr(66), chr(69));
-        $flag = $this->test_objects['BST']->contains(chr(66), $offset, $parent);
-        $this->assertTrue($flag, "BST contains third insert");
-        $this->test_objects['BST']->insertUpdate(chr(69), chr(69));
-        $flag = $this->test_objects['BST']->contains(chr(69), $offset, $parent);
-        $this->assertTrue($flag, "BST contains fourth insert");
-    }
-
-    /**
-     * Check if can modify objects in BST array
-     */
-    public function updateTestCase()
-    {
-        $this->test_objects['BST']->insertUpdate(chr(65), chr(66));
-        $this->test_objects['BST']->insertUpdate(chr(67), chr(68));
-        $this->test_objects['BST']->insertUpdate(chr(66), chr(69));
-        $this->test_objects['BST']->insertUpdate(chr(69), chr(69));
-        $this->test_objects['BST']->insertUpdate(chr(66), chr(66));
-        $this->test_objects['BST']->contains(chr(66), $offset, $parent);
-        list($key, $value, $left, $right) = $this->test_objects['BST']->
-            readEntry($offset);
-        $this->assertEqual($value, chr(66), "BST contains fourth insert");
-    }
-
-}
-?>
diff --git a/views/search_view.php b/views/search_view.php
index b0f5b1472..aaa5f6f4f 100755
--- a/views/search_view.php
+++ b/views/search_view.php
@@ -113,7 +113,7 @@ class SearchView extends View implements CrawlConstants
                     $data['TOTAL_ROWS']));
             ?> )</h2>
             <?php
-            foreach($data['PAGES'] as $page) {?>
+            foreach($data['PAGES'] as $page) {?>
                 <div class='result'>
                 <h2>
                 <a href="<?php if($page[self::TYPE] != "link") {
@@ -148,6 +148,7 @@ class SearchView extends View implements CrawlConstants
                         e($data['QUERY']); ?>&amp;arg=<?php
                         e(urlencode($page[self::URL]));
                         ?>&amp;so=<?php  e($page[self::SUMMARY_OFFSET]);
+                        ?>&amp;g=<?php  e($page[self::CACHE_PAGE_PARTITION]);
                         ?>&amp;its=<?php e($data['its']); ?>" >
                     <?php
                     if($page[self::TYPE] == "text/html" ||
ViewGit