Implements Issue 18, updates the sqlite default.db, a=chris

Chris Pollett [2010-11-27 23:Nov:th]
Implements Issue 18, updates the sqlite default.db, a=chris
Filename
INSTALL
README
lib/index_archive_bundle.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/index_bundle_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/phrase_filter_iterator.php
lib/index_bundle_iterators/union_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
lib/phrase_parser.php
lib/utility.php
models/phrase_model.php
tests/index_shard_test.php
diff --git a/INSTALL b/INSTALL
index c48840026..ab3303363 100755
--- a/INSTALL
+++ b/INSTALL
@@ -52,10 +52,7 @@ php queue_server.php terminal
 from the bin folder.

 (3) You need at least one fetcher running
-to download pages. You can set fetchers up either on the
-same machine or on other machines. The QUEUE_SERVER
-in config.php says the url of the server to get fetch batches
-from. To run a fetcher, simply type:
+to download pages. To run a fetcher, simply type:

 php fetcher.php terminal

diff --git a/README b/README
index 727bb3d1c..696efc756 100755
--- a/README
+++ b/README
@@ -58,7 +58,9 @@ Credits
 ------
 Source code due to Chris Pollett. Several people helped
 with localization: Mary Pollett, Youn Kim, Sugi Widjaja,
-Chao-Hsin Shih, Sujata Dongre, Jonathan Ben-David
+Chao-Hsin Shih, Sujata Dongre, Jonathan Ben-David.
+Thanks to Ravi Dhillon for finding and helping with the
+fixes for Issue 15 and Commit 632e46.

 Installation
 -------------
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 9f5462532..fdaa1fe6e 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -379,7 +379,6 @@ class IndexArchiveBundle implements CrawlConstants
     {
         $words_array = array();
         if(!is_array($word_keys) || count($word_keys) < 1) { return NULL;}
-
         foreach($word_keys as $word_key) {
             $tmp = $this->getCurrentShard()->getWordInfo($word_key);
             if($tmp === false) {
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index e574d7276..93b49211c 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -102,6 +102,8 @@ class GroupIterator extends IndexBundleIterator
     {
         $this->index_bundle_iterator = $index_bundle_iterator;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
+        $this->results_per_block =
+            $this->index_bundle_iterator->results_per_block;
         $this->reset();
     }

@@ -303,8 +305,10 @@ class GroupIterator extends IndexBundleIterator

     /**
      * Forwards the iterator one group of docs
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      larger than or equal to this value
      */
-    function advance()
+    function advance($doc_offset = null)
     {
         $this->advanceSeenDocs();

@@ -323,10 +327,21 @@ class GroupIterator extends IndexBundleIterator
             $this->grouped_keys[$hash_url] = true;
         }

-        $this->index_bundle_iterator->advance();
+        $this->index_bundle_iterator->advance($doc_offset);

     }

+    /**
+     * Gets the doc_offset for the next document that would be returned by
+     * this iterator; the lookup is delegated to the wrapped iterator
+     *
+     * @return int the desired document offset
+     */
+    function currentDocOffsetWithWord() {
+        return $this->index_bundle_iterator->currentDocOffsetWithWord();
+    }
+
+
     /**
      * Returns the index associated with this iterator
      * @return &object the index
diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php
index 5ab90e609..3878121d7 100644
--- a/lib/index_bundle_iterators/index_bundle_iterator.php
+++ b/lib/index_bundle_iterators/index_bundle_iterator.php
@@ -69,14 +69,24 @@ abstract class IndexBundleIterator implements CrawlConstants
      * @var array
      */
     var $pages;
-
+
     /**
      * Says whether the value in $this->count_block is up to date
      * @var bool
      */
     var $current_block_fresh;

+    /**
+     * Number of documents returned for each block (at most)
+     * @var int
+     */
+    var $results_per_block = self::RESULTS_PER_BLOCK;

+    /**
+     *  Default number of documents returned for each block (at most)
+     * @var int
+     */
+    const RESULTS_PER_BLOCK = 100;

     /**
      * Returns the iterators to the first document block that it could iterate
@@ -86,14 +96,25 @@ abstract class IndexBundleIterator implements CrawlConstants

     /**
      * Forwards the iterator one group of docs
+     * @param $doc_index if set the next block must all have $doc_indexes larger
+     *      than this value
      */
-    abstract function advance();
+    abstract function advance($doc_index = null);
+
     /**
      * Returns the index associated with this iterator
      * @return object the index
      */
     abstract function &getIndex($key = NULL);

+    /**
+     * Gets the doc_offset for the next document that would be return by
+     * this iterator
+     *
+     * @return int the desired document offset
+     */
+    abstract function currentDocOffsetWithWord();
+
     /**
      * Hook function used by currentDocsWithWord to return the current block
      * of docs if it is not cached
@@ -164,11 +185,14 @@ abstract class IndexBundleIterator implements CrawlConstants

     /**
      * Get the current block of doc summaries for the word iterator and advances
-     * the current pointer to the next block
+     * the current pointer to the next block of documents. If a doc offset is
+     * given, the next block must be of docs at or after this doc offset
      *
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      equal to or larger than this value
      * @return array doc summaries matching the $this->restrict_phrases
      */
-    function nextDocsWithWord()
+    function nextDocsWithWord($doc_offset = null)
     {
         $doc_block = $this->getSummariesFromCurrentDocs();

@@ -176,7 +200,7 @@ abstract class IndexBundleIterator implements CrawlConstants
             return NULL;
         }

-        $this->advance();
+        $this->advance($doc_offset);

         return $doc_block;

@@ -198,5 +222,16 @@ abstract class IndexBundleIterator implements CrawlConstants
         $this->seen_docs += $this->count_block;
     }

+    /**
+     * Sets the value of the results_per_block field. This field controls
+     * the maximum number of results that can be returned in one go by
+     * currentDocsWithWord()
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block
+     */
+     function setResultsPerBlock($num) {
+        $this->results_per_block = $num;
+     }
 }
 ?>
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index d498461dd..62f36b691 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -66,12 +66,6 @@ class IntersectIterator extends IndexBundleIterator
      */
     var $num_iterators;

-    /**
-     * The number of documents in the current block before filtering
-     * by restricted words
-     * @var int
-     */
-    var $count_block_unfiltered;
     /**
      * The number of documents in the current block after filtering
      * by restricted words
@@ -104,16 +98,20 @@ class IntersectIterator extends IndexBundleIterator

         $this->num_iterators = count($index_bundle_iterators);
         $this->num_docs = -1;
+        $this->results_per_block = 1;

         /*
              the most results we can return is the size of the least num_docs
-             of what we are itrerating over
+             of what we are iterating over. We are also setting up here
+             that we return at most one posting at a time from each
+             iterator
         */
         for($i = 0; $i < $this->num_iterators; $i++) {
             if( $this->num_docs < 0 ||
                 $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) {
                 $this->num_docs = $this->index_bundle_iterators[$i]->num_docs;
             }
+            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
         }
         $this->reset();
     }
@@ -130,7 +128,6 @@ class IntersectIterator extends IndexBundleIterator

         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
-        $doc_block = $this->currentDocsWithWord();

     }

@@ -143,134 +140,89 @@ class IntersectIterator extends IndexBundleIterator
     function findDocsWithWord()
     {
         $pages = array();
-        $high_ranks = array();
-        $last = $this->num_iterators - 1;
-        for($i = 0; $i < $this->num_iterators; $i++) {
-            $pages[$i] =
-                $this->index_bundle_iterators[$i]->currentDocsWithWord();
-            if(!is_array($pages[$i]) || count($pages[$i]) == 0) {
-                $this->to_advance_index = $i;
-                return $pages[$i];
-            }
-            list($low_ranks[$i], $high_ranks[$i]) =
-                $this->lowHighRanks($pages[$i], $i);
-        }
-        uasort($low_ranks, "docRankOrderCallback");
-
-       $low_ranks = array_values($low_ranks);
-
-       $low_rank = $low_ranks[$last][self::DOC_RANK];
-
-       $this->to_advance_index = $low_ranks[0]["INDEX"];
-       $this->count_block_unfiltered = count($pages[$this->to_advance_index]);
-
-        $docs = array();
-        $looping = true;
-
-        while ($looping == true) {
-            for($i = 0; $i <= $last; $i++) {
-            list( ,$high_ranks[$i]) =
-                $this->lowHighRanks($pages[$i], $i, false);
-            }
-            $broke = false;
-            $score = 0;
-            $high_rank = $high_ranks[0][self::DOC_RANK];
-            $high_key = $high_ranks[0]["KEY"];
-            $high_index = $high_ranks[0]["INDEX"];
-            $to_deletes = array();
-            for($i = 1; $i <= $last; $i++) {
-                if($high_ranks[$i][self::DOC_RANK] < $low_rank ) {
-                    $looping = false;
-                    break 2;
-                }
-                if($high_ranks[$i][self::DOC_RANK] > $high_rank ||
-                    ($high_ranks[$i][self::DOC_RANK] == $high_rank &&
-                        strcmp($high_ranks[$i]["KEY"], $high_key) > 0)
-                    ) {
-                    $broke = true;
-                    $high_rank = $high_ranks[$i][self::DOC_RANK];
-                    $high_index = $high_ranks[$i]["INDEX"];
-                    $high_key = $high_ranks[$i]["KEY"];
-                    $to_deletes[$high_index] = $high_key;
-                }
-                $score += $high_ranks[$i][self::SCORE];
-            }
-            if($broke == false) {
-                $docs[$high_key] = $pages[$high_index][$high_key];
-                $docs[$high_key][self::SCORE] = $score;
-                $to_deletes[$high_index] = $high_key;
-            }
-
-            foreach($to_deletes as $index => $key) {
-                unset($pages[$index][$key]);
-                if(count($pages[$index]) == 0) {
-                    $looping = false;
-                }
-            }

+        $status = $this->syncDocOffsetsAmongstIterators();
+        if($status == -1) {
+            return -1;
         }
+        $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
         $this->count_block = count($docs);
         $this->pages = $docs;
         return $docs;
     }

     /**
-     * Given a collection of documents, returns info about the low and high
-     * ranking documents. Namely, their ranks, keys,
-     * index in word iterator array, and scores
      *
-     * @param array &$docs documents to get low high info from
-     * @param int $index which word iterator these docs came from
-     * @param boo $sort_flag whether to sort the docs (if true) or to assume
-     *      the docs are already sorted by rank
-     * @return array desired info
      */
-    function lowHighRanks(&$docs, $index, $sort_flag = true)
+    function syncDocOffsetsAmongstIterators()
     {
-        if($sort_flag == true) {
-            uasort($docs, "docRankOrderCallback");
-        }
-        reset($docs);
-        $high = array();
-        $high["KEY"] = key($docs);
-        $high[self::DOC_RANK] = $docs[$high["KEY"]][self::DOC_RANK];
-        $high[self::SCORE] = $docs[$high["KEY"]][self::SCORE];
-        $high["INDEX"] = $index;
-        end($docs);
-        $low = array();
-        $low["KEY"] = key($docs);
-        $low[self::DOC_RANK] =  $docs[$low["KEY"]][self::DOC_RANK];
-        $low[self::SCORE] =  $docs[$low["KEY"]][self::SCORE];
-        $low["INDEX"] = $index;
-        return array($low, $high);
+        // Returns 1 once every iterator sits on the same doc offset and -1 if
+        // any iterator is exhausted (or there is nothing to intersect).
+        if($this->num_iterators < 1) { return -1; }
+        do {
+            // re-read all offsets each pass; earlier advances moved them
+            $offsets = array();
+            for($i = 0; $i < $this->num_iterators; $i++) {
+                $offsets[$i] =
+                    $this->index_bundle_iterators[$i]->currentDocOffsetWithWord();
+                if($offsets[$i] == -1) {
+                    return -1;
+                }
+            }
+            $biggest_offset = max($offsets);
+            if(min($offsets) == $biggest_offset) {
+                return 1;
+            }
+            // only iterators that lag behind need to catch up
+            for($i = 0; $i < $this->num_iterators; $i++) {
+                if($offsets[$i] < $biggest_offset) {
+                    $this->index_bundle_iterators[$i]->advance($biggest_offset);
+                }
+            }
+        } while(true);
     }

     /**
      * Forwards the iterator one group of docs
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      larger than or equal to this value
      */
-    function advance()
+    function advance($doc_offset = null)
     {
         $this->advanceSeenDocs();

-        	$this->seen_docs_unfiltered += $this->count_block_unfiltered;
+        $this->seen_docs_unfiltered = 0;

-        $min_num_docs = 10000000000;
+        //num_docs can change when advance() called so that's why we recompute
+        $total_num_docs = 0;
         for($i = 0; $i < $this->num_iterators; $i++) {
-            if($this->index_bundle_iterators[$i]->num_docs < $min_num_docs) {
-                $min_num_docs = $this->index_bundle_iterators[$i]->num_docs;
-            }
+             $this->seen_docs_unfiltered +=
+                $this->index_bundle_iterators[$i]->seen_docs;
+            $total_num_docs = $this->index_bundle_iterators[$i]->num_docs;
         }
         if($this->seen_docs_unfiltered > 0) {
             $this->num_docs =
-                floor(($this->seen_docs * $min_num_docs) /
+                floor(($this->seen_docs * $total_num_docs) /
                 $this->seen_docs_unfiltered);
         } else {
             $this->num_docs = 0;
         }
-        $this->index_bundle_iterators[$this->to_advance_index]->advance();
+
+        $this->index_bundle_iterators[0]->advance($doc_offset);

     }

+    /**
+     * Gets the doc_offset for the next document that would be returned by
+     * this iterator, after aligning all of the intersected iterators
+     * @return int the document offset, or -1 if some iterator is exhausted
+     */
+    function currentDocOffsetWithWord() {
+        $status = $this->syncDocOffsetsAmongstIterators();
+        if($status == -1) { return -1; }
+        return $this->index_bundle_iterators[0]->currentDocOffsetWithWord();
+    }
+
     /**
      * Returns the index associated with this iterator
      * @return object the index
@@ -279,5 +231,23 @@ class IntersectIterator extends IndexBundleIterator
     {
         return $this->index_bundle_iterators[0]->getIndex($key = NULL);
     }
+
+    /**
+     * This method is supposed to set
+     * the value of the results_per_block field. This field controls
+     * the maximum number of results that can be returned in one go by
+     * currentDocsWithWord(). This method cannot be consistently
+     * implemented for this iterator and still have it behave nicely
+     * if this iterator is used together with union_iterator. So
+     * to prevent a user from doing this, calling this method results
+     * in a user defined error (E_USER_ERROR)
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block
+     */
+     function setResultsPerBlock($num) {
+        trigger_error("Cannot set the results per block of
+            an intersect iterator", E_USER_ERROR);
+     }
 }
 ?>
diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php
index fc7fa5fc6..36e492812 100644
--- a/lib/index_bundle_iterators/phrase_filter_iterator.php
+++ b/lib/index_bundle_iterators/phrase_filter_iterator.php
@@ -122,6 +122,8 @@ class PhraseFilterIterator extends IndexBundleIterator
         $this->restrict_phrases = $restrict_phrases;
         $this->disallow_phrases = $disallow_phrases;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
+        $this->results_per_block =
+            $this->index_bundle_iterator->results_per_block;
         $this->weight = $weight;
         $this->current_block_fresh = false;
         $this->reset();
@@ -249,8 +251,11 @@ class PhraseFilterIterator extends IndexBundleIterator

     /**
      * Forwards the iterator one group of docs
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      larger than or equal to this value
      */
-    function advance()
+    function advance($doc_offset = null)
+
     {
         $this->advanceSeenDocs();

@@ -267,7 +272,17 @@ class PhraseFilterIterator extends IndexBundleIterator
             $this->num_docs = 0;
         }

-        $this->index_bundle_iterator->advance();
+        $this->index_bundle_iterator->advance($doc_offset);
+    }
+
+    /**
+     * Gets the doc_offset for the next document that would be returned by
+     * this iterator; the lookup is delegated to the wrapped iterator
+     *
+     * @return int the desired document offset
+     */
+    function currentDocOffsetWithWord() {
+        return $this->index_bundle_iterator->currentDocOffsetWithWord();
     }

     /**
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
index 3e5916f5e..a5cb0b6bd 100644
--- a/lib/index_bundle_iterators/union_iterator.php
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -102,8 +102,16 @@ class UnionIterator extends IndexBundleIterator
         */
         $this->num_iterators = count($index_bundle_iterators);
         $this->num_docs = 0;
+        $this->results_per_block = 0;
         for($i = 0; $i < $this->num_iterators; $i++) {
             $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
+            /*
+                result_per_block is at most the sum of
+                results_per_block of things we are iterating. Value
+                is already init'd in base class.
+             */
+            $this->results_per_block +=
+                $this->index_bundle_iterators[$i]->results_per_block;
         }
         $this->reset();
     }
@@ -195,8 +203,10 @@ class UnionIterator extends IndexBundleIterator

     /**
      * Forwards the iterator one group of docs
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      larger than or equal to this value
      */
-    function advance()
+    function advance($doc_offset = null)
     {
         $this->advanceSeenDocs();

@@ -205,7 +215,7 @@ class UnionIterator extends IndexBundleIterator
         $total_num_docs = 0;
         for($i = 0; $i < $this->num_iterators; $i++) {
             $total_num_docs += $this->index_bundle_iterators[$i]->num_docs;
-            $this->index_bundle_iterators[$i]->advance();
+            $this->index_bundle_iterators[$i]->advance($doc_index);
         }
         if($this->seen_docs_unfiltered > 0) {
             $this->num_docs =
@@ -238,5 +248,38 @@ class UnionIterator extends IndexBundleIterator
             return $this->index_bundle_iterators[0]->getIndex($key);
         }
     }
+
+    /**
+     * This method is supposed to set
+     * the value of the results_per_block field. This field controls
+     * the maximum number of results that can be returned in one go by
+     * currentDocsWithWord(). This method cannot be consistently
+     * implemented for this iterator and still have it behave nicely
+     * if this iterator is used together with intersect_iterator. So
+     * to prevent a user from doing this, calling this method results
+     * in a user defined error (E_USER_ERROR)
+     *
+     * @param int $num the maximum number of results that can be returned by
+     *      a block
+     */
+     function setResultsPerBlock($num) {
+        trigger_error("Cannot set the results per block of
+            a union iterator", E_USER_ERROR);
+     }
+
+    /**
+     * This method is supposed to
+     * get the doc_offset for the next document that would be returned by
+     * this iterator. As the union iterator as written returns a block
+     * of size at least the number of iterators in it, and this iterator
+     * is intended to be used when results_per_block is 1, we generate
+     * a user defined error (E_USER_ERROR) instead of computing a value.
+     *
+     * @return int the desired document offset
+     */
+    function currentDocOffsetWithWord() {
+        trigger_error("Cannot get the doc offset with word of
+            a union iterator", E_USER_ERROR);
+    }
 }
 ?>
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index 9a00f8cfb..42db86f03 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -96,6 +96,12 @@ class WordIterator extends IndexBundleIterator
      */
     var $current_offset;

+    /**
+     * Starting Offset of word occurence in the IndexShard
+     * @var int
+     */
+    var $start_offset;
+
     /**
      * Last Offset of word occurence in the IndexShard
      * @var int
@@ -109,11 +115,8 @@ class WordIterator extends IndexBundleIterator
      */
     var $empty;

-    /**
-     *  Number of documents returned for each block (at most)
-     * @var int
-     */
-    const RESULTS_PER_BLOCK = 100;
+
+

     /**
      * Creates a word iterator with the given parameters.
@@ -127,15 +130,16 @@ class WordIterator extends IndexBundleIterator
     function __construct($word_key, $index, $raw = false)
     {
         $this->word_key = $word_key;
-
         $this->index =  $index;
         $this->current_block_fresh = false;
+
         $tmp = $index->getCurrentShard()->getWordInfo($word_key, $raw);
         if ($tmp === false) {
             $this->empty = true;
         } else {
-            list($this->current_offset, $this->last_offset, $this->num_docs)
+            list($this->start_offset, $this->last_offset, $this->num_docs)
                 = $tmp;
+            $this->current_offset = $this->start_offset;
             $this->empty = false;

             $this->reset();
@@ -167,27 +171,51 @@ class WordIterator extends IndexBundleIterator
             return -1;
         }
         $this->next_offset = $this->current_offset;
-        $results = $this->index->getCurrentShard()->getWordSlice(
-            $this->next_offset, $this->last_offset, self::RESULTS_PER_BLOCK);
+        $results = $this->index->getCurrentShard()->getPostingsSlice(
+            $this->next_offset, $this->last_offset, $this->results_per_block);
         return $results;
     }


     /**
      * Forwards the iterator one group of docs
+     * @param $doc_offset if set the next block must all have $doc_offsets
+     *      larger than or equal to this value
      */
-    function advance()
+    function advance($doc_offset = null)
     {
         $this->advanceSeenDocs();
         if($this->current_offset < $this->next_offset) {
             $this->current_offset = $this->next_offset;
+            if($doc_offset !== null) {
+                $this->current_offset =
+                    $this->index->getCurrentShard(
+                        )->nextPostingOffsetDocOffset($this->next_offset,
+                            $this->last_offset, $doc_offset);
+                $this->seen_docs =
+                    ($this->current_offset - $this->start_offset)/
+                        IndexShard::POSTING_LEN;
+            }
         } else {
             $this->current_offset = $this->last_offset + 1;
         }
-
+    }
+

+    /**
+     * Gets the doc_offset for the next document that would be returned by
+     * this iterator, or -1 once current_offset has passed last_offset
+     *
+     * @return int the desired document offset
+     */
+    function currentDocOffsetWithWord() {
+        if($this->current_offset > $this->last_offset) {
+            return -1;
+        }
+        return $this->index->getCurrentShard(
+                        )->docOffsetFromPostingOffset($this->current_offset);
     }
-
+
     /**
      * Returns the index associated with this iterator
      * @return &object the index
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 40122dfa5..188fb45a1 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -96,39 +96,27 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      *
      * @var array
      */
-     var $firsts;
-
-    /**
-     *
-     * @var int
-     */
-     var $firsts_len;
-
-    /**
-     *
-     * @var array
-     */
-     var $seconds;
+    var $words;

     /**
+     * Stores length of the words array in the shard on disk. Only set if
+     * we're in $read_only_from_disk mode
      *
      * @var int
      */
-     var $seconds_len;
+     var $words_len;

     /**
      *
      * @var array
      */
-    var $words;
+    var $prefixes;

     /**
-     * Stores length of the words array in the shard on disk. Only set if
-     * we're in $read_only_from_disk mode
      *
      * @var int
      */
-     var $words_len;
+    var $prefixes_len;

     /**
      * This is supposed to hold the number of documents that have been stored
@@ -198,17 +186,17 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     /**
      * Header Length of an IndexShard (sum of its non-variable length fields)
      */
-    const HEADER_LENGTH = 40;
+    const HEADER_LENGTH = 36;

     /**
      * Length of a Word entry in bytes in the shard
      */
-    const WORD_ITEM_LEN = 14;
+    const WORD_ITEM_LEN = 16;

     /**
-     * Length of a doc offset occurrence pair in a posting list
+     * Length of one posting ( a doc offset occurrence pair) in a posting list
      */
-    const DOC_OCCURRENCES_LEN = 4;
+    const POSTING_LEN = 4;

     /**
      * Makes an index shard with the given file name and generation offset
@@ -227,10 +215,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         parent::__construct($fname, -1);
         $this->generation_offset = $generation_offset;
         $this->word_docs = "";
-        $this->firsts_len = 0;
-        $this->firsts = array();
-        $this->seconds_len = 0;
-        $this->seconds = array();
         $this->words_len = 0;
         $this->word_docs_len = 0;
         $this->words = array();
@@ -290,16 +274,13 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         }
         foreach($word_counts as $word => $occurrences) {
             $word_id = crawlHash($word, true);
-            $first = $word_id[0];
-            $second = $word_id[1];
-            $rest_id = substr($word_id, 2);
             $occurrences = ($occurrences > 255 ) ? 255 : $occurrences & 255;
-            $store =  pack("N", ($this->docids_len << 4) + $occurrences);
-            if(!isset($this->words[$first][$second][$rest_id])) {
-                $this->words[$first][$second][$rest_id] = $store;
-            } else if($this->words[$first][$second][$rest_id] !=
+            $store =  $this->packPosting($this->docids_len >> 4, $occurrences);
+            if(!isset($this->words[$word_id])) {
+                $this->words[$word_id] = $store;
+            } else if($this->words[$word_id] !=
                 pack("N", self::DUPLICATE_FLAG)) {
-                $this->words[$first][$second][$rest_id] .= $store;
+                $this->words[$word_id] .= $store;
             }
             if($occurrences > 0) {
                 if($is_doc == true) {
@@ -308,7 +289,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                     $link_doc_len += $occurrences;
                 }
             }
-            $this->word_docs_len += self::DOC_OCCURRENCES_LEN;
+            $this->word_docs_len += self::POSTING_LEN;
         }

         $this->len_all_docs += $doc_len;
@@ -333,7 +314,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function getWordInfo($word_id, $raw = false)
     {
-
         if($raw == false) {
             //get rid of out modfied base64 encoding
             $hash = str_replace("_", "/", $word_id);
@@ -359,14 +339,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * the list stops if an offset larger than $last_offset is seen or
      * $len many doc's have been returned. Since $next_offset is passed by
      * reference the value of $next_offset will point to the next record in
-     * the list (if it exists) after thhe function is called.
+     * the list (if it exists) after the function is called.
      *
      * @param int &$next_offset where to start in word docs
      * @param int $last_offset offset at which to stop by
      * @param int $len number of documents desired
      * @return array desired list of doc's and their info
      */
-    function getWordSlice(&$next_offset, $last_offset, $len)
+    function getPostingsSlice(&$next_offset, $last_offset, $len)
     {
         if(!$this->read_only_from_disk && !$this->word_docs_packed) {
             $this->packWordDocs();
@@ -379,11 +359,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         do {
             if($next_offset >= $this->word_docs_len) {break;}
             $item = array();
-            $doc_string = $this->getWordDocsSubstring($next_offset, 4);
-            $tmp = unpack("N", $doc_string);
-            $doc_int = $tmp[1];
-            $occurrences = $doc_int & 255;
-            $doc_index = ($doc_int >> 8);
+            $posting = $this->getWordDocsSubstring($next_offset, 4);
+            list($doc_index, $occurrences) = $this->unpackPosting($posting);
             $old_next_offset = $next_offset;
             $next_offset += 4;
             $doc_depth = log(10*(($doc_index +1) +
@@ -444,26 +421,78 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         return $results;
     }

+    /**
+     * Seeks within a posting range to the doc at $doc_offset (gallop + bisect)
+     */
+     function nextPostingOffsetDocOffset($start_offset, $end_offset,
+        $doc_offset) {
+
+        $doc_index = $doc_offset >> 4;
+        $current = floor($start_offset/self::POSTING_LEN);
+        $end = floor($end_offset/self::POSTING_LEN);
+        $low = $current;
+        $high = $end;
+        $stride = 1;
+        $gallop_phase = true;
+
+        do {
+            $posting = $this->getWordDocsSubstring($current*self::POSTING_LEN,
+                self::POSTING_LEN);
+            list($post_doc_index, ) = $this->unpackPosting($posting);
+            if($doc_index == $post_doc_index) {
+                return $current * self::POSTING_LEN;
+            } else if($doc_index < $post_doc_index) {
+                if($low == $current) {
+                    return $current * self::POSTING_LEN;
+                } else if($gallop_phase) {
+                    $gallop_phase = false;
+                }
+                $high = $current;
+                $current = (($low + $high) >> 1);
+            } else {
+                $low = $current;
+                if($gallop_phase) {
+                    $current += $stride;
+                    $stride <<= 1;
+                } else {
+                    if($current + 1 == $high) {
+                        $current++;
+                        $low = $current;
+                    }
+                    $current = (($low + $high) >> 1);
+                }
+            }
+
+        } while($current <= $end);
+
+        return false;
+     }
+
+    /**
+     * Returns the doc offset (doc index << 4) of the posting at $offset
+     */
+    function docOffsetFromPostingOffset($offset) {
+        $posting = $this->getWordDocsSubstring($offset, self::POSTING_LEN);
+        list($doc_index, ) = $this->unpackPosting($posting);
+        return ($doc_index << 4);
+    }

     /**
      * Returns $len many documents which contained the word corresponding to
-     * $word_id
+     * $word_id (only works for loaded shards)
      *
      * @param string $word_id key to look up documents for
      * @param int number of documents desired back (from start of word linked
      *      list).
      * @return array desired list of doc's and their info
      */
-    function getWordSliceById($word_id, $len)
+    function getPostingsSliceById($word_id, $len)
     {
         $results = array();
-        $first = $word_id[0];
-        $second = $word_id[1];
-        $rest_id = substr($word_id, 2);
-        if(isset($this->words[$first][$second][$rest_id])) {
+        if(isset($this->words[$word_id])) {
             list($first_offset, $last_offset,
                 $num_docs_or_links) = $this->getWordInfo($word_id, true);
-            $results = $this->getWordSlice($first_offset, $last_offset, $len);
+            $results = $this->getPostingsSlice($first_offset, $last_offset, $len);
         }
         return $results;
     }
@@ -476,7 +505,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function appendIndexShard(&$index_shard)
     {
-
         if($this->word_docs_packed == true) {
             $this->unpackWordDocs();
         }
@@ -485,37 +513,32 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         }
         $this->doc_infos .= $index_shard->doc_infos;

-        foreach($index_shard->words as $first => $rest) {
-        foreach($rest as $second => $second_rest) {
-        foreach($second_rest as $rest_id => $postings) {
+        foreach($index_shard->words as $word_id => $postings) {
             $postings_len = strlen($postings);
             // update doc offsets for newly added docs
-            for($i = 0; $i < $postings_len; $i +=4) {
-                $doc_occurrences_string = substr($postings, $i, 4);
-                $tmp = unpack("N", $doc_occurrences_string);
-                $num = $tmp[1];
+            for($i = 0; $i < $postings_len; $i += self::POSTING_LEN) {
+                $num = $this->unpackInt(substr($postings, $i, 4));
                 if($num != self::DUPLICATE_FLAG) {
                     $num += ($this->docids_len << 4);
-                    $doc_occurrences_string = pack("N", $num);
-                    charCopy($doc_occurrences_string, $postings, $i, 4);
+                    charCopy(pack("N", $num), $postings, $i, 4);
                 }
             }
             $dup = pack("N", self::DUPLICATE_FLAG);
-            if(!isset($this->words[$first][$second][$rest_id])) {
-                $this->words[$first][$second][$rest_id] = $postings;
+            if(!isset($this->words[$word_id])) {
+                $this->words[$word_id] = $postings;
                 $this->word_docs_len += $postings_len;
-            } else if($this->words[$first][$second][$rest_id] == $dup
+            } else if($this->words[$word_id] == $dup
                 || $postings == $dup) {
                 $old_word_docs_len = strlen(
-                    $this->words[$first][$second][$rest_id]);
-                $this->words[$first][$second][$rest_id] = $dup;
+                    $this->words[$word_id]);
+                $this->words[$word_id] = $dup;
                 $this->word_docs_len -= $old_word_docs_len;
                 $this->word_docs_len += strlen($dup);
             } else {
-                $this->words[$first][$second][$rest_id] .= $postings;
+                $this->words[$word_id] .= $postings;
                 $this->word_docs_len += $postings_len;
             }
-        }}}
+        }

         $this->docids_len += $index_shard->docids_len;
         $this->num_docs += $index_shard->num_docs;
@@ -575,11 +598,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $doc_key = crawlHash($duplicate, true);
             $this->doc_infos .= $doc_key . pack("N", self::DUPLICATE_FLAG).
                 pack("N", 0xFFFFFFFF);
-            $word_key = crawlHash("info:".$duplicate, true);
-            $first = $word_key[0];
-            $second =  $word_key[1];
-            $rest_id = substr($word_key, 2);
-            $this->words[$first][$second][$rest_id] =
+            $word_id = crawlHash("info:".$duplicate, true);
+            $this->words[$word_id] =
                 pack("N", $this->docids_len);
             $this->docids_len += 16;
         }
@@ -591,9 +611,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     public function save()
     {
-        $this->computeFirstsSeconds();
-        $header = pack("N", $this->firsts_len) .
-            pack("N", $this->seconds_len) .
+        $this->prepareWordsAndPrefixes();
+        $header =  pack("N", $this->prefixes_len) .
             pack("N", $this->words_len) .
             pack("N", $this->word_docs_len) .
             pack("N", $this->docids_len) .
@@ -604,64 +623,65 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             pack("N", $this->len_all_link_docs);
         $fh = fopen($this->filename, "wb");
         fwrite($fh, $header);
+        fwrite($fh, $this->prefixes);
         $this->packWordDocs($fh);
         fwrite($fh, $this->word_docs);
         fwrite($fh, $this->doc_infos);
         fclose($fh);
     }

-    function computeFirstsSeconds()
+    /**
+     * Sorts words by key and builds the two-byte-prefix lookup table
+     */
+    function prepareWordsAndPrefixes()
     {
-        $this->firsts_len = 0;
-        $this->seconds_len = 0;
-        $this->words_len = 0;
+        $this->words_len = count($this->words) * IndexShard::WORD_ITEM_LEN;
+        ksort($this->words, SORT_STRING);
+        $blank = pack("N", 0xFFFFFFFF).pack("N", 0xFFFFFFFF);
+        $tmp = array();
+        $offset = 0;
+        $num_words = 0;
+        $old_prefix = false;
+        $word_item_len = IndexShard::WORD_ITEM_LEN;
         foreach($this->words as $first => $rest) {
-            $this->firsts_len += 4;
-            $len = count($rest) << 2;
-            $this->firsts[$first] = $len;
-            foreach($rest as $second => $words) {
-                $third = count($this->words[$first][$second]) *
-                    IndexShard::WORD_ITEM_LEN;
-                $this->seconds[$first][$second] = $third;
-                $this->words_len += $third;
+            $prefix = (ord($first[0]) << 8) + ord($first[1]);
+            if($old_prefix === $prefix) {
+                $num_words++;
+            } else {
+                if($old_prefix !== false) {
+                    $tmp[$old_prefix] = pack("N", $offset) .
+                        pack("N", $num_words);
+                    $offset += $num_words * $word_item_len;
+                }
+                $old_prefix = $prefix;
+                $num_words = 1;
+            }
+        }
+        $tmp[$old_prefix] = pack("N", $offset) . pack("N", $num_words);
+        $num_prefixes = 2 << 16;
+        $this->prefixes = "";
+        for($i = 0; $i < $num_prefixes; $i++) {
+            if(isset($tmp[$i])) {
+                $this->prefixes .= $tmp[$i];
+            } else {
+                $this->prefixes .= $blank;
             }
-            $this->seconds_len += $len;
         }
+        $this->prefixes_len = strlen($this->prefixes);
     }

     function packWordDocs($fh = null)
     {
-        if($fh == null) {
-            $this->computeFirstsSeconds();
-        }
-        $this->word_docs = "";
-        $this->word_docs_len = 0;
-        if($fh != null) {
-            array_walk($this->firsts, function (&$value, $key, &$fh) {
-                $out = pack("N", (ord($key) << 24) + $value);
-                fwrite($fh, $out);
-            }, $fh);
-
-            array_walk_recursive($this->seconds, function (&$value, $key, &$fh){
-                $out = pack("N", (ord($key) << 24) + $value);
-                fwrite($fh, $out);
-            }, $fh);
-        }
         $this->word_docs_len = 0;
         $this->word_docs = "";
-        foreach($this->words as $first => $seconds) {
-            foreach($seconds as $second => $rest) {
-                ksort($rest); // write out sorted, so can binary search on disk
-                foreach($rest as $rest_id => $postings) {
-                    $len = strlen($postings);
-                    $out = pack("N", $this->word_docs_len).pack("N", $len);
-                    $this->word_docs .= $postings;
-                    $this->word_docs_len += $len;
-                    $this->words[$first][$second][$rest_id] = $out;
-                    if($fh != null) {
-                        fwrite($fh, $rest_id . $out);
-                    }
-                }
+        foreach($this->words as $word_id => $postings) {
+            $len = strlen($postings);
+            $out = pack("N", $this->word_docs_len).pack("N", $len);
+            $this->word_docs .= $postings;
+            $this->word_docs_len += $len;
+            $this->words[$word_id] = $out;
+            if($fh != null) {
+                fwrite($fh, $word_id . $out);
             }
         }
         $this->word_docs_packed = true;
@@ -674,20 +694,34 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function unpackWordDocs()
     {
-        foreach($this->words as $first => $seconds) {
-            foreach($seconds as $second => $rest) {
-                foreach($rest as $rest_id => $postings_info) {
-                    $offset = $this->unpackInt(substr($postings_info, 0, 4));
-                    $len = $this->unpackInt(substr($postings_info, 4, 4));
-                    $postings = substr($this->word_docs, $offset, $len);
-                    $this->words[$first][$second][$rest_id] = $postings;
-                }
-            }
+        foreach($this->words as $word_id => $postings_info) {
+            $offset = $this->unpackInt(substr($postings_info, 0, 4));
+            $len = $this->unpackInt(substr($postings_info, 4, 4));
+            $postings = substr($this->word_docs, $offset, $len);
+            $this->words[$word_id] = $postings;
         }
         unset($this->word_docs);
         $this->word_docs_packed = false;
     }

+    /**
+     * Packs a doc index (high bits) and occurrence count (low 8 bits)
+     */
+     function packPosting($doc_index, $occurrences)
+     {
+        return pack("N", ($doc_index << 8) + $occurrences);
+     }
+
+    /**
+     * Unpacks a posting string into array($doc_index, $occurrences)
+     */
+     function unpackPosting($posting)
+     {
+        $doc_int = $this->unpackInt($posting);
+        $occurrences = $doc_int & 255;
+        $doc_index = ($doc_int >> 8);
+        return array($doc_index, $occurrences);
+     }

     /**
      * Returns the first offset, last offset, and number of documents the
@@ -704,57 +738,36 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     {
         $this->getShardHeader();
         $word_item_len = self::WORD_ITEM_LEN;
-        $first = $word_id[0];
-        $second = $word_id[1];
-        if(!isset($this->firsts) || $this->firsts == null ||
-            count($this->firsts) == 0) {
-            /*  if firsts not read in yet assume seconds not as well
-                seconds is about 256k, so hope memcache is active
-             */
-            $firsts = $this->getShardSubstring(self::HEADER_LENGTH,
-                $this->firsts_len);
-            $seconds = $this->getShardSubstring(self::HEADER_LENGTH +
-                $this->firsts_len,
-                $this->seconds_len);
-            $this->unpackFirstSeconds($firsts, $seconds);
-            unset($firsts);
-            unset($seconds);
+        $prefix = (ord($word_id[0]) << 8) + ord($word_id[1]);
+        $prefix_info = $this->getShardSubstring(
+            self::HEADER_LENGTH + 8*$prefix, 8);
+        $blank = pack("N", 0xFFFFFFFF).pack("N", 0xFFFFFFFF);
+        if($prefix_info == $blank) {
+            return false;
         }
+        $offset = $this->unpackInt(substr($prefix_info, 0, 4));

-        $start = self::HEADER_LENGTH + $this->firsts_len +
-            $this->seconds_len;
-        $high = 0;
-        foreach($this->seconds as $first_let => $seconds) {
-            foreach($seconds as $second_let => $third_len) {
-                if($first_let == $first && $second_let == $second) {
-                    $high = floor($third_len/$word_item_len) - 1;
-                    break 2;
-                }
-                $start += $third_len;
-            }
-        }
+        $high = $this->unpackInt(substr($prefix_info, 4, 4)) - 1;

+        $start = self::HEADER_LENGTH + $this->prefixes_len  + $offset;
         $low = 0;
-
-        $check_loc = ($low + $high >> 1);
-
+        $check_loc = (($low + $high) >> 1);
         do {
             $old_check_loc = $check_loc;
             $word_string = $this->getShardSubstring($start +
-                $word_item_len +$check_loc * $word_item_len,
-                $word_item_len);
+                $check_loc * $word_item_len, $word_item_len);
             if($word_string == false) {return false;}
-            $word_string = $this->getShardSubstring($start
-                +$check_loc * $word_item_len,
-                $word_item_len);
-            $id = substr($word_string, 0, 6);
-            $cmp = strcmp($word_id, $first.$second.$id);
+            $id = substr($word_string, 0, 8);
+            $cmp = strcmp($word_id, $id);
             if($cmp === 0) {
-                return $this->getWordInfoFromString(substr($word_string, 6));
+                return $this->getWordInfoFromString(substr($word_string, 8));
             } else if ($cmp < 0) {
                 $high = $check_loc;
                 $check_loc = (($low + $check_loc) >> 1);
             } else {
+                if($check_loc + 1 == $high) {
+                    $check_loc++;
+                }
                 $low = $check_loc;
                 $check_loc = (($high + $check_loc) >> 1);
             }
@@ -775,18 +788,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     function getWordInfoLoaded($word_id)
     {
-        $first = $word_id[0];
-        $second = $word_id[1];
-        $rest_id = substr($word_id, 2);
-        if(!isset($this->words[$first][$second][$rest_id])) {
+        if(!isset($this->words[$word_id])) {
             return false;
         }
         if(!$this->word_docs_packed){
             $this->packWordDocs();
         }
-
         return $this->getWordInfoFromString(
-            $this->words[$first][$second][$rest_id]);
+            $this->words[$word_id]);
     }

     /**
@@ -800,7 +809,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     {
         $first_offset = self::unpackInt(substr($str, 0, 4));
         $len = self::unpackInt(substr($str, 4, 4));
-        $last_offset = $first_offset + $len;
+        $last_offset = $first_offset + $len - self::POSTING_LEN;
         $count = $len >> 2;

         return array($first_offset, $last_offset, $count);
@@ -819,7 +828,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     {
         if($this->read_only_from_disk) {
             $base_offset = self::HEADER_LENGTH +
-                $this->firsts_len + $this->seconds_len + $this->words_len;
+                $this->prefixes_len + $this->words_len;
             return $this->getShardSubstring($base_offset + $offset, $len);
         }
         return substr($this->word_docs, $offset, $len);
@@ -837,8 +846,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     function getDocInfoSubstring($offset, $len)
     {
         if($this->read_only_from_disk) {
-            $base_offset = self::HEADER_LENGTH + $this->words_len
-                + $this->firsts_len + $this->seconds_len + $this->word_docs_len;
+            $base_offset = self::HEADER_LENGTH + $this->prefixes_len +
+                $this->words_len + $this->word_docs_len;
             return $this->getShardSubstring($base_offset + $offset, $len);
         }
         return substr($this->doc_infos, $offset, $len);
@@ -912,26 +921,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         self::headerToShardFields($header, $this);
     }

-    /**
-     *
-     */
-    function unpackFirstSeconds($firsts, $seconds)
-    {
-        $pre_firsts_array = str_split($firsts, 4);
-        array_walk($pre_firsts_array, 'IndexShard::makeFirsts', $this);
-
-        $total_offset = 0;
-        foreach($this->firsts as $first => $seconds_len) {
-            for($offset=0; $offset < $seconds_len; $offset += 4) {
-                $pre_out = self::unpackInt(
-                    substr($seconds,$total_offset +$offset,4));
-                $second = chr(($pre_out >> 24));
-                $third_len = 0x00FFFFFF & $pre_out;
-                $this->seconds[$first][$second] = $third_len;
-            }
-            $total_offset += $seconds_len;
-        }
-    }

     /**
      *  Load an IndexShard from a file
@@ -945,30 +934,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $fh = fopen($fname, "rb");
         $header = fread($fh, self::HEADER_LENGTH);
         self::headerToShardFields($header, $shard);
-        $firsts = fread($fh, $shard->firsts_len);
-        $seconds = fread($fh, $shard->seconds_len);
+        fread($fh, $shard->prefixes_len );
         $words = fread($fh, $shard->words_len);
         $shard->word_docs = fread($fh, $shard->word_docs_len);
         $shard->doc_infos = fread($fh, $shard->docids_len);
         fclose($fh);
-        $shard->unpackFirstSeconds($firsts, $seconds);
-        unset($firsts);
-        unset($seconds);
-        $total_offset = 0;
-        foreach($shard->seconds as $first => $seconds_info) {
-            foreach($seconds_info as $second => $third_len) {
-                for($offset = 0; $offset < $third_len;
-                    $offset += self::WORD_ITEM_LEN) {
-                    $value = substr($words,
-                        $total_offset + $offset, self::WORD_ITEM_LEN);
-                    $rest_id = substr($value, 0, 6);
-                    $info = substr($value, 6);
-                    $shard->words[$first][$second][$rest_id] = $info;
-                }
-                $total_offset += $third_len;
-            }
-        }
+
+        $pre_words_array = str_split($words, self::WORD_ITEM_LEN);
         unset($words);
+        array_walk($pre_words_array, 'IndexShard::makeWords', $shard);
+
         return $shard;
     }

@@ -983,16 +958,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     {
         $header_array = str_split($header, 4);
         $header_data = array_map('IndexShard::unpackInt', $header_array);
-        $shard->firsts_len = $header_data[0];
-        $shard->seconds_len = $header_data[1];
-        $shard->words_len = $header_data[2];
-        $shard->word_docs_len = $header_data[3];
-        $shard->docids_len = $header_data[4];
-        $shard->generation_offset = $header_data[5];
-        $shard->num_docs = $header_data[6];
-        $shard->num_link_docs = $header_data[7];
-        $shard->len_all_docs = $header_data[8];
-        $shard->len_all_link_docs = $header_data[9];
+        $shard->prefixes_len = $header_data[0];
+        $shard->words_len = $header_data[1];
+        $shard->word_docs_len = $header_data[2];
+        $shard->docids_len = $header_data[3];
+        $shard->generation_offset = $header_data[4];
+        $shard->num_docs = $header_data[5];
+        $shard->num_link_docs = $header_data[6];
+        $shard->len_all_docs = $header_data[7];
+        $shard->len_all_link_docs = $header_data[8];
     }

     /**
@@ -1007,12 +981,18 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         return $tmp[1];
     }

-    static function makeFirsts(&$value, $key, &$shard)
+
+    /**
+     * Callback function for load method. splits a word_key . word_info string
+     * into an entry in the passed shard $shard->words[word_key] = $word_info.
+     *
+     * @param string &value  the word_key . word_info string
+     * @param int $key index in array - we don't use
+     * @param object $shard IndexShard to add the entry to word table for
+     */
+    static function makeWords(&$value, $key, &$shard)
     {
-        $pre_out = self::unpackInt($value);
-        $first = chr($pre_out >> 24);
-        $seconds_len = (0x00FFFFFF & $pre_out);
-        $shard->firsts[$first] = $seconds_len;
+        $shard->words[substr($value, 0, 8)] = substr($value, 8, 8);
     }

 }
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 22d2d9008..b281028b1 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -133,7 +133,7 @@ class PhraseParser
     static function extractPhrasesOfLengthOffset($string,
         $phrase_len, $offset)
     {
-        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&";
         $words = mb_split("[[:space:]]|".$punct, $string);

         $stems = array();
diff --git a/lib/utility.php b/lib/utility.php
index 9fbbf6918..c35cab166 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -200,10 +200,8 @@ function crawlCrypt($string, $salt = NULL)
  * precision
  *
  * @param string $start starting time with microseconds
- * @param string $end ending time with microseconds
+ * @param string $end ending time with microseconds, if null use current time
  * @return float time difference in seconds
- * @see SigninModel::changePassword()
- * @see SigninModel::checkValidSignin()
  */
 function changeInMicrotime( $start, $end=NULL )
 {
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 134775de1..5acd07ded 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -276,7 +276,7 @@ class PhraseModel extends Model
         $index_archive_name = self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle(
             CRAWL_DIR.'/cache/'.$index_archive_name);
-        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&";
         $phrase_string = mb_ereg_replace($punct, " ", $phrase_string);
         $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
         /*
@@ -322,7 +322,7 @@ class PhraseModel extends Model
             $restrict_phrases = array_unique($restrict_phrases);
             $restrict_phrases = array_filter($restrict_phrases);
             $index_archive->setCurrentShard(0, true);
-            $words_array = $index_archive->getSelectiveWords($hashes, 10);
+            $words_array = $index_archive->getSelectiveWords($hashes, 5);

             if(is_array($words_array)) {
                 reset($words_array);
diff --git a/tests/index_shard_test.php b/tests/index_shard_test.php
index 4bc2c6144..36cbe65b9 100644
--- a/tests/index_shard_test.php
+++ b/tests/index_shard_test.php
@@ -81,7 +81,7 @@ class IndexShardTest extends UnitTest
     /**
      * Check if can store documents into an index shard and retrieve them
      */
-    public function addDocumentsGetWordSliceByIdTestCase()
+    public function addDocumentsGetPostingsSliceByIdTestCase()
     {
         $docid = "AAAAAAAA";
         $offset = 5;
@@ -101,7 +101,7 @@ class IndexShardTest extends UnitTest
         $this->assertEqual($this->test_objects['shard']->len_all_docs, 9,
             "Len All Docs Correctly Counts Length of First Doc");

-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('CCCCCCCC', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup by word works");
@@ -117,7 +117,7 @@ class IndexShardTest extends UnitTest
         );
         $this->test_objects['shard']->addDocumentWords($docid,
             $offset, $word_counts, $meta_ids);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('CCCCCCCC', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Work lookup first item of two works");
@@ -127,7 +127,7 @@ class IndexShardTest extends UnitTest
             "Exactly two items were found in two item case");

         //add a meta word lookup
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('EEEEEEEE', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup by meta word works");
@@ -140,7 +140,7 @@ class IndexShardTest extends UnitTest
     /**
      * Check if can store link documents into an index shard and retrieve them
      */
-    public function addLinkGetWordSliceByIdTestCase()
+    public function addLinkGetPostingsSliceByIdTestCase()
     {
         $docid = "AAAAAAAA:BBBBBBBB:CCCCCCCC"; //set up link doc
         $offset = 5;
@@ -159,7 +159,7 @@ class IndexShardTest extends UnitTest
             $offset, $word_counts, $meta_ids);
         $this->assertEqual($this->test_objects['shard']->len_all_link_docs, 9,
             "Len All Docs Correctly Counts Length of First Doc");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('MMMMMMMM', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA:BBBBBBBB:CCCCCCCC"]),
             "Link Doc lookup by word works");
@@ -178,7 +178,7 @@ class IndexShardTest extends UnitTest

         $this->test_objects['shard']->addDocumentWords($docid,
             $offset, $word_counts, $meta_ids);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('MMMMMMMM', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA:BBBBBBBB:CCCCCCCC"]),
             "Link Doc lookup by word works 1st of two");
@@ -233,15 +233,15 @@ class IndexShardTest extends UnitTest
             $offset, $word_counts, $meta_ids);
         $this->test_objects['shard']->appendIndexShard(
             $this->test_objects['shard2']);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('BBBBBBBB', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 1");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('CCCCCCCC', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 2");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('DDDDDDDD', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 3");
@@ -249,27 +249,27 @@ class IndexShardTest extends UnitTest
             "Data from second shard present 1");
         $this->assertTrue(isset($c_data["GGGGGGGG"]),
             "Data from third shard present 1");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('EEEEEEEE', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 4");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('FFFFFFFF', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 5");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('ZZZZZZZZ', true), 5);
         $this->assertTrue(isset($c_data["KKKKKKKK:GGGGGGGG:HHHHHHHH"]),
             "Data from second shard present 2");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('IIIIIIII', true), 5);
         $this->assertTrue(isset($c_data["GGGGGGGG"]),
             "Data from third shard present 2");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('JJJJJJJJ', true), 5);
         $this->assertTrue(isset($c_data["GGGGGGGG"]),
             "Data from third shard present 3");
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('KKKKKKKK', true), 5);
         $this->assertTrue(isset($c_data["GGGGGGGG"]),
             "Data from third shard present 4");
@@ -317,7 +317,7 @@ class IndexShardTest extends UnitTest
         );
         $this->test_objects['shard']->addDocumentWords($docid,
             $offset, $word_counts, $meta_ids);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('BBBBBBBB', true), 5);
         $new_doc_offsets = array(
             "AAAAAAAA" => 5,
@@ -326,7 +326,7 @@ class IndexShardTest extends UnitTest
             "DDDDDDDD" => 7,
         );
         $this->test_objects['shard']->changeDocumentOffsets($new_doc_offsets);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('BBBBBBBB', true), 5);
         $predicted_offsets = array(
             "AAAAAAAA" => 5,
@@ -359,7 +359,7 @@ class IndexShardTest extends UnitTest
         $doc_urls = array("http://somewhere.com/");

         $this->test_objects['shard']->markDuplicateDocs($doc_urls);
-        $c_data = $this->test_objects['shard']->getWordSliceById(
+        $c_data = $this->test_objects['shard']->getPostingsSliceById(
             crawlHash('info:http://somewhere.com/', true), 5);
         $this->assertTrue(isset(
             $c_data[crawlHash($doc_urls[0], true)][CrawlConstants::DUPLICATE]),
@@ -390,23 +390,23 @@ class IndexShardTest extends UnitTest
         $this->test_objects['shard2'] = IndexShard::load("shard.txt");
         $this->assertEqual($this->test_objects['shard2']->len_all_docs, 9,
             "Len All Docs Correctly Counts Length of First Doc");
-        $c_data = $this->test_objects['shard2']->getWordSliceById(
+        $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             crawlHash('BBBBBBBB', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup by word works");
-        $c_data = $this->test_objects['shard2']->getWordSliceById(
+        $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             crawlHash('CCCCCCCC', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup 2 by word works");
-        $c_data = $this->test_objects['shard2']->getWordSliceById(
+        $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             crawlHash('DDDDDDDD', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup 2 by word works");
-        $c_data = $this->test_objects['shard2']->getWordSliceById(
+        $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             crawlHash('EEEEEEEE', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup 2 by word works");
-        $c_data = $this->test_objects['shard2']->getWordSliceById(
+        $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             crawlHash('FFFFFFFF', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Doc lookup 2 by word works");
ViewGit