Fixes bugs in single word case of IndexArchiveBundle

Chris Pollett [2010-07-25 17:Jul:th]
Fixes bugs in single word case of IndexArchiveBundle
Filename
bin/fetcher.php
bin/queue_server.php
controllers/settings_controller.php
lib/bloom_filter_bundle.php
lib/bloom_filter_file.php
lib/crawl_constants.php
lib/crawl_daemon.php
lib/hash_table.php
lib/index_archive_bundle.php
lib/phrase_parser.php
lib/porter_stemmer.php
lib/priority_queue.php
lib/processors/pdf_processor.php
lib/string_array.php
lib/web_archive.php
lib/web_archive_bundle.php
lib/web_queue_bundle.php
models/phrase_model.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index cfb5ba4bb..484b06a84 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -60,15 +60,15 @@ foreach(glob(BASE_DIR."/lib/processors/*_processor.php") as $filename) {

 /** To support English language stemming of words (jumps, jumping --> jump)*/
 require_once BASE_DIR."/lib/porter_stemmer.php";
-/** */
+/** Used to manipulate urls*/
 require_once BASE_DIR."/lib/url_parser.php";
-/** */
+/** Used to extract summaries from web pages*/
 require_once BASE_DIR."/lib/phrase_parser.php";
 /** for crawlHash and crawlLog */
 require_once BASE_DIR."/lib/utility.php";
 /** for crawlDaemon function */
 require_once BASE_DIR."/lib/crawl_daemon.php";
-/** */
+/** Used to fetch web pages and info from the queue server */
 require_once BASE_DIR."/lib/fetch_url.php";
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";
@@ -107,65 +107,93 @@ mb_regex_encoding("UTF-8");
 class Fetcher implements CrawlConstants
 {
     /**
+     * Reference to a database object. Used since has directory manipulation
+     * functions
      * @var object
      */
     var $db;
     /**
-     * @var object
+     * Url or IP address of the queue_server to get sites to crawl from
+     * @var string
      */
     var $queue_server;
     /**
+     * Contains each of the file extensions this fetcher will try to process
      * @var array
      */
     var $indexed_file_types;
     /**
+     * An associative array of (mimetype => name of processor class to handle)
+     * pairs.
      * @var array
      */
     var $page_processors;
     /**
+     * WebArchiveBundle  used to store complete web pages and auxiliary data
      * @var object
      */
     var $web_archive;
     /**
+     * Timestamp of the current crawl
      * @var int
      */
     var $crawl_time;
     /**
+     * Contains the list of web pages to crawl from the queue_server
      * @var array
      */
     var $to_crawl;
     /**
+     * Summary information for visited sites that the fetcher hasn't sent to
+     * the queue_server yet
      * @var array
      */
     var $found_sites;
     /**
+     * Timestamp from the queue_server of the current schedule of sites to
+     * download. This is sent back to the server once this schedule is completed
+     * to help the queue server implement crawl-delay if needed.
      * @var int
      */
     var $schedule_time;
     /**
+     * The sum of the number of words of all the page description for the current
+     * crawl. This is used in computing document statistics.
      * @var int
      */
     var $sum_seen_site_description_length;
     /**
+     * The sum of the number of words of all the page titles for the current
+     * crawl. This is used in computing document statistics.
      * @var int
      */
     var $sum_seen_title_length;
     /**
+     * The sum of the number of words in all the page links for the current
+     * crawl. This is used in computing document statistics.
      * @var int
      */
     var $sum_seen_site_link_length;
     /**
+     * Number of sites crawled in the current crawl
      * @var int
      */
     var $num_seen_sites;
     /**
+     * Stores the name of the ordering used to crawl pages. This is used in a
+     * switch/case when computing weights of urls to be crawled before sending
+     * these new urls back to the queue_server.
      * @var string
      */
     var $crawl_order;


     /**
+     * Sets up the field variables for that crawling can begin
      *
+     * @param array $indexed_file_types file extensions to index
+     * @param array $page_processors (mimetype => name of processor) pairs
+     * @param string $queue_server URL or IP address of the queue server
      */
     function __construct($indexed_file_types, $page_processors, $queue_server)
     {
@@ -189,7 +217,7 @@ class Fetcher implements CrawlConstants
         $this->num_seen_sites = 0;

         //we will get the correct crawl order from the queue_server
-        $this->crawl_order = "OPIC";
+        $this->crawl_order = self::PAGE_IMPORTANCE;
     }


@@ -209,7 +237,13 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Main loop for the fetcher.
      *
+     * Checks for stop message, checks queue server if crawl has changed and
+     * for new pages to crawl. Loop gets a group of next pages to crawl if
+     * there are pages left to crawl (otherwise sleep 5 seconds). It downloads
+     * these pages, deduplicates them, and updates the found site info with the
+     * result before looping again.
      */
     function loop()
     {
@@ -292,6 +326,11 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Deletes any crawl web archive bundles not in the provided array of crawls
+     *
+     * @param array $still_active_crawls those crawls which should be kept;
+     *      all other crawls will be deleted
+     * @see loop()
      */
     function deleteOldCrawls(&$still_active_crawls)
     {
@@ -341,13 +380,17 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Get status, current crawl, crawl order, and new site information from
+     * the queue_server.
+     *
+     * @return array containing this info
      */
     function checkScheduler()
     {
         $info = array();
-
+        $info[self::STATUS]  = self::CONTINUE_STATE;
         if(count($this->to_crawl) > 0) {
-            $info[self::STATUS]  = self::CONTINUE_STATE;
+
             return;
         }

@@ -374,18 +417,24 @@ class Fetcher implements CrawlConstants
         if(isset($info[self::SCHEDULE_TIME])) {
               $this->schedule_time = $info[self::SCHEDULE_TIME];
         }
+
         crawlLog("  Time to check Scheduler ".(changeInMicrotime($start_time)));

-        return $info;
+        return $info;
     }

     /**
+     * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
+     * downloaded in one go using the to_crawl array. Delete these sites
+     * from the to_crawl array.
+     *
+     * @return array sites which are ready to be downloaded
      */
     function getFetchSites()
     {

         $web_archive = $this->web_archive;
-
+
         $start_time = microtime();

         $seeds = array();
@@ -439,6 +488,11 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Does page deduplication on an array of downloaded pages using a
+     * BloomFilterBundle of $this->web_archive. Deduplication based
+     * on summaries is also done on the queue server.
+     *
+     * @param array &$site_pages pages to deduplicate
      */
     function deleteSeenPages(&$site_pages)
     {
@@ -466,6 +520,14 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Processes an array of downloaded web pages with the appropriate page
+     * processor.
+     *
+     * Summary data is extracted from each non robots.txt file in the array.
+     * Disallowed paths and crawl-delays are extracted from robots.txt files.
+     *
+     * @param array $site_pages a collection of web pages to process
+     * @return array summary data extracted from these pages
      */
     function processFetchPages($site_pages)
     {
@@ -600,7 +662,13 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Parses the contents of a robots.txt page extracting disallowed paths and
+     * Crawl-delay
      *
+     * @param array $robot_site array containing info about one robots.txt page
+     * @return array the $robot_site array with two new fields: one containing
+     *      an array of disallowed paths, the other containing the crawl-delay
+     *      if any
      */
     function processRobotPage($robot_site)
     {
@@ -658,7 +726,15 @@ class Fetcher implements CrawlConstants
     }

     /**
-    */
+     * Updates the $this->found_sites array with data from the most recently
+     * downloaded sites. This means updating the following sub arrays:
+     * the self::ROBOT_PATHS, self::TO_CRAWL. It checks if there are still
+     * more urls to crawl or if self::SEEN_URLS has grown larger than
+     * SEEN_URLS_BEFORE_UPDATE_SCHEDULER. If so, a mini index is built and,
+     * the queue server is called with the data.
+     *
+     * @param array $sites site data to use for the update
+     */
     function updateFoundSites($sites)
     {
         $start_time = microtime();
@@ -676,7 +752,7 @@ class Fetcher implements CrawlConstants
                         self::CRAWL_DELAY] = $site[self::CRAWL_DELAY];
                 }
             } else {
-                $this->found_sites[self::SEEN_URLS][] = $site;
+                $this->found_sites[self::ROBOT_PATHS][] = $site;
                 if(isset($site[self::LINKS])) {
                     if(!isset($this->found_sites[self::TO_CRAWL])) {
                         $this->found_sites[self::TO_CRAWL] = array();
@@ -730,7 +806,16 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Updates the queue_server about sites that have been crawled.
      *
+     * This method is called if there are currently no more sites to crawl or
+     * if SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages have been processed. It
+     * creates an inverted index of the non robot pages crawled and then compresses
+     * and does a post request to send the page summary data, robot data,
+     * to crawl url data, and inverted index back to the server. In the event
+     * that the server doesn't acknowledge it loops and tries again after a
+     * delay until the post is successful. At this point, memory for this data
+     * is freed.
      */
     function updateScheduler()
     {
@@ -788,6 +873,15 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Builds an inverted index (word --> {docs it appears in}) for the current
+     * batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. This inverted
+     * index is then merged by the queue_server into the inverted index of the
+     * current generation of the crawl. The complete inverted index for the
+     * whole crawl is built out of these inverted indexes for generations.
+     * The point of computing a partial inverted index on the fetcher is to
+     * reduce some of the computational burden on the queue server. The
+     * resulting mini index computed by buildMiniInvertedIndex() is stored in
+     * $this->found_sites[self::INVERTED_INDEX]
      *
      */
     function buildMiniInvertedIndex()
@@ -926,7 +1020,11 @@ class Fetcher implements CrawlConstants


     /**
+     * Used to compute number of words in each component (title, description,
+     * links) of a document separately as well as compute average amongst the
+     * current group of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many docs.
      *
+     * @return array computed statistics
      */
     function computeDocumentStatistics()
     {
@@ -1003,7 +1101,7 @@ class Fetcher implements CrawlConstants
     /**
      * Computes a sum of the values of an associative array of key-value pairs
      *
-     * @param array the associative array to compute the sum of
+     * @param array &$arr the associative array to compute the sum of
      */
     function sumCountArray(&$arr)
     {
diff --git a/bin/queue_server.php b/bin/queue_server.php
index b0e3e5a10..61a53c88d 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -52,10 +52,10 @@ require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";
 /** Load the class that maintains our URL queue */
 require_once BASE_DIR."/lib/web_queue_bundle.php";

-/**  */
+/** Load word->{array of docs with word} index class */
 require_once BASE_DIR."/lib/index_archive_bundle.php";

-/**  */
+/** Used for manipulating urls*/
 require_once BASE_DIR."/lib/url_parser.php";

 /**  For crawlHash function */
diff --git a/controllers/settings_controller.php b/controllers/settings_controller.php
index 189d46934..b73802702 100755
--- a/controllers/settings_controller.php
+++ b/controllers/settings_controller.php
@@ -114,7 +114,8 @@ class SettingsController extends Controller
         $crawls = $this->crawlModel->getCrawlList();
         $data['CRAWLS'] = array();
         foreach($crawls as $crawl) {
-            $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'];
+            $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'].
+                " ... ".$crawl['COUNT']." urls";
         }
         $crawl_stamps = array_keys($data['CRAWLS']);
         if($token_okay && isset($_REQUEST['index_ts']) &&
diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php
index 6710bd8d9..32ff0be7b 100644
--- a/lib/bloom_filter_bundle.php
+++ b/lib/bloom_filter_bundle.php
@@ -35,14 +35,15 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
  *
- * Code used to manage a bloom filter in-memory and in file
- * a Bloom filter is used to store a set of objects.
- * It can support inserts into the set and it can also be
- * used to check membership in the set.
+ * A BloomFilterBundle is a directory of BloomFilterFiles.
+ * The filter bundle, like a Bloom filter, also acts as a set,
+ * but once the active filter in it fills up a new filter is
+ * added to the bundle so that more data can be stored.
  *
  * @author Chris Pollett
  * @package seek_quarry
  * @subpackage library
+ * @see BloomFilterFile
  */
 class BloomFilterBundle
 {
@@ -78,9 +79,14 @@ class BloomFilterBundle
     const default_filter_size = 10000000;

     /**
+     * Creates or loads if already exists the directory structure and
+     * BloomFilterFiles used by this bundle
      *
+     * @param string $dir_name directory where this bundle's data is stored
+     * @param int $filter_size the size of an individual filter in this bundle;
+     *      once a filter is filled a new one is added to the directory
      */
-    public function __construct($dir_name,
+    function __construct($dir_name,
         $filter_size = self::default_filter_size )
     {
         $this->dir_name = $dir_name;
@@ -107,9 +113,14 @@ class BloomFilterBundle
     }

     /**
+     * Inserts a $value into the BloomFilterBundle
      *
+     * This involves inserting into the current filter, if the filter
+     * is full, a new filter is added before the value is added
+     *
+     * @param string $value an item to add to the filter bundle
      */
-    public function add($value)
+    function add($value)
     {
         if($this->current_filter_count >= $this->filter_size) {
             $this->current_filter->save();
@@ -130,9 +141,14 @@ class BloomFilterBundle
     }

     /**
+     * Removes from the passed array those elements $elt that are in
+     * the filter bundle, or whose $elt[$field_name] is in the bundle.
      *
+     * @param array &$arr the array to remove elements from
+     * @param string $field_name if not NULL the field name of $arr to use to
+     *      do filtering
      */
-    public function differenceFilter(&$arr, $field_name = NULL)
+    function differenceFilter(&$arr, $field_name = NULL)
     {

         $num_filters = $this->num_filters;
@@ -160,9 +176,10 @@ class BloomFilterBundle
     }

     /**
-     *
+     * Loads from the filter bundles' meta.txt the meta data associated with
+     * this filter bundle and stores this data into field variables
      */
-    public function loadMetaData()
+    function loadMetaData()
     {
         if(file_exists($this->dir_name.'/meta.txt')) {
             $meta = unserialize(
@@ -181,7 +198,7 @@ class BloomFilterBundle
      * Saves the meta data (number of filter, number of items stored, and size)
      * of the bundle
      */
-    public function saveMetaData()
+    function saveMetaData()
     {
         $meta = array();
         $meta['NUM_FILTERS'] = $this->num_filters;
@@ -194,7 +211,7 @@ class BloomFilterBundle
     /**
      * Used to save to disk all the file data associated with this bundle
      */
-    public function forceSave()
+    function forceSave()
     {
         $this->saveMetaData();
         $this->current_filter->save();
diff --git a/lib/bloom_filter_file.php b/lib/bloom_filter_file.php
index d816928c9..77c494d87 100755
--- a/lib/bloom_filter_file.php
+++ b/lib/bloom_filter_file.php
@@ -34,14 +34,13 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Load base class with methods for loading and saving this structure
  */
 require_once "persistent_structure.php";

 /**
- *
- * Code used to manage a bloom filter in-memory and in file
- * a Bloom filter is used to store a set of objects.
+ * Code used to manage a bloom filter in-memory and in file.
+ * A Bloom filter is used to store a set of objects.
  * It can support inserts into the set and it can also be
  * used to check membership in the set.
  *
@@ -52,14 +51,35 @@ require_once "persistent_structure.php";
 class BloomFilterFile extends PersistentStructure
 {

+    /**
+     * Number of bit positions in the Bloom filter used to say an item is
+     * in the filter
+     * @var int
+     */
     var $num_keys;
+    /**
+     * Size in bits of the packed string array used to store the filter's
+     * contents
+     * @var int
+     */
     var $filter_size;
+    /**
+     * Packed string used to store the Bloom filters
+     * @var string
+     */
     var $filter;

     /**
+     * Initializes the fields of the BloomFilter and its base
+     * PersistentStructure.
      *
+     * @param string $fname name of the file to store the BloomFilter data in
+     * @param int $num_values the maximum number of values that will be stored
+     *      in the BloomFilter. Filter will be sized so the odds of a false
+     *      positive are roughly one over this value
+     * @param int $save_frequency how often to store the BloomFilter to disk
      */
-    public function __construct($fname, $num_values,
+    function __construct($fname, $num_values,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
     {
         $log2 = log(2);
@@ -75,9 +95,11 @@ class BloomFilterFile extends PersistentStructure
     }

     /**
+     * Inserts the provided item into the Bloomfilter
      *
+     * @param string $value item to add to filter
      */
-    public function add($value)
+    function add($value)
     {
         $num_keys = $this->num_keys;
         for($i = 0;  $i < $num_keys; $i++) {
@@ -89,9 +111,12 @@ class BloomFilterFile extends PersistentStructure
     }

     /**
+     * Checks if the BloomFilter contains the provided $value
      *
+     * @param string $value item to check if is in the BloomFilter
+     * @return bool whether $value was in the filter or not
      */
-    public function contains($value)
+    function contains($value)
     {
         $num_keys = $this->num_keys;
         for($i = 0;  $i < $num_keys; $i++) {
@@ -106,7 +131,10 @@ class BloomFilterFile extends PersistentStructure
     }

     /**
+     * Hashes $value to a bit position in the BloomFilter
      *
+     * @param string $value value to map to a bit position in the filter
+     * @return int the bit position mapped to
      */
     function getHashBitPosition($value)
     {
@@ -120,7 +148,9 @@ class BloomFilterFile extends PersistentStructure
     }

     /**
+     * Sets to true the ith bit position in the filter.
      *
+     * @param int $i the position to set to true
      */
     function setBit($i)
     {
@@ -129,13 +159,16 @@ class BloomFilterFile extends PersistentStructure
         $bit_in_byte = $i - ($byte << 3);

         $tmp = $this->filter[$byte];
-
+
         $this->filter[$byte] = $tmp | chr(1 << $bit_in_byte);

     }

     /**
+     * Looks up the value of the ith bit position in the filter
      *
+     * @param int $i the position to look up
+     * @return bool the value of the looked up position
      */
     function getBit($i)
     {
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 022265052..da45cb25a 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -34,7 +34,9 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Shared constants and enums used by components that are involved in the
+ * crawling process
+ *
  * @author Chris Pollett
  * @package seek_quarry
  * @subpackage library
diff --git a/lib/crawl_daemon.php b/lib/crawl_daemon.php
index 6c46a3c3c..cb3039077 100644
--- a/lib/crawl_daemon.php
+++ b/lib/crawl_daemon.php
@@ -34,14 +34,20 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Load system-wide defines
  */
-
 require_once BASE_DIR."/configs/config.php";
-require_once BASE_DIR."/lib/utility.php"; //for crawlLog
+/**
+ * Load the crawlLog function
+ */
+require_once BASE_DIR."/lib/utility.php";
+/**
+ *  Load common constants for crawling
+ */
 require_once BASE_DIR."/lib/crawl_constants.php";

 /**
+ * Used to run scripts as a daemon on *nix systems
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -49,10 +55,19 @@ require_once BASE_DIR."/lib/crawl_constants.php";
  */
 class CrawlDaemon implements CrawlConstants
 {
+
+    /**
+     * Name prefix to be used on files associated with this daemon
+     * (such as lock and message files)
+     * @var string
+     * @static
+     */
     static $name;

     /**
+     * Callback function to handle signals sent to this daemon
      *
+     * @param int $signo signal sent to the daemon
      */
     static function processHandler($signo)
     {
@@ -65,7 +80,7 @@ class CrawlDaemon implements CrawlConstants
                  file_put_contents(
                     CRAWL_DIR."/schedules/".self::$name."_messages.txt",
                     serialize($info));
-                 unlink(CRAWL_DIR."/schedules/".self::$name."_lock.txt");
+                 unlink(CRAWL_DIR."/schedules/".self::$name."_lock.txt");
              break;

              case SIGSEGV:
@@ -79,7 +94,17 @@ class CrawlDaemon implements CrawlConstants
     }

     /**
+     * Used to send a message the given daemon or run the program in the
+     * foreground.
      *
+     * @param array $argv an array of command line arguments. If the argument
+     *      is start and the process control functions exist, a child process
+     *      is forked and detached to act as a daemon.
+     *      a lock file will be created to prevent additional daemons from
+     *      running. If the message is stop then a message file is written to
+     *      tell the daemon to stop. If the argument is terminal then the
+     *      program won't be run as a daemon.
+     * @param string $name the prefix to use for lock and message files
      */
     static function init($argv, $name)
     {
@@ -103,8 +128,6 @@ class CrawlDaemon implements CrawlConstants
         //the next code is for running as a daemon on *nix systems
         $terminal_flag = strcmp($argv[1], "terminal") == 0;
         if(function_exists("pcntl_fork") && !$terminal_flag)  {
-
-
             $pid = pcntl_fork();
             if ($pid == -1) {
                 die("could not fork");
diff --git a/lib/hash_table.php b/lib/hash_table.php
index 7a547d06b..49474e0dd 100755
--- a/lib/hash_table.php
+++ b/lib/hash_table.php
@@ -52,21 +52,54 @@ require_once "utility.php";
 class HashTable extends StringArray
 {

+    /**
+     *
+     * @var int
+     */
     var $key_size;
+    /**
+     *
+     * @var int
+     */
     var $value_size;
+    /**
+     *
+     * @var string
+     */
     var $null;
+    /**
+     *
+     * @var string
+     */
     var $deleted;
-
+    /**
+     *
+     * @var int
+     */
     var $count;

+    /**
+     *
+     *
+     */
     const ALWAYS_RETURN_PROBE = 1;
+    /**
+     *
+     *
+     */
     const RETURN_PROBE_ON_KEY_FOUND = 0;
+    /**
+     *
+     *
+     */
     const RETURN_VALUE = -1;


     /**
-    */
-    public function __construct($fname, $num_values, $key_size, $value_size,
+     *
+     *
+     */
+    function __construct($fname, $num_values, $key_size, $value_size,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
     {
         $this->key_size = $key_size;
@@ -80,7 +113,11 @@ class HashTable extends StringArray
             $key_size + $value_size, $save_frequency);
     }

-    public function insert($key, $value)
+    /**
+     *
+     *
+     */
+    function insert($key, $value)
     {
         $null = $this->null;
         $deleted = $this->deleted;
@@ -124,13 +161,20 @@ class HashTable extends StringArray
     }


+    /**
+     *
+     *
+     */
     function lookup($key, $return_probe_value = self::RETURN_VALUE)
     {
         return $this->lookupArray(
             $key, array($this->null), $return_probe_value);
     }

-
+    /**
+     *
+     *
+     */
     function lookupArray($key, $null_array,
         $return_probe_value = self::RETURN_VALUE)
     {
@@ -167,7 +211,11 @@ class HashTable extends StringArray

     }

-    public function delete($key)
+    /**
+     *
+     *
+     */
+    function delete($key)
     {
         $deleted = pack("H2x".($this->key_size + $this->value_size - 1), "FF");
             //deletes
@@ -185,6 +233,10 @@ class HashTable extends StringArray

     }

+    /**
+     *
+     *
+     */
     function getEntry($i)
     {
         $raw = $this->get($i);
@@ -194,6 +246,10 @@ class HashTable extends StringArray
         return array($key, $value);
     }

+    /**
+     *
+     *
+     */
     function hash($key)
     {
         $hash = substr(md5($key, true), 0, 4);
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 5b349929a..86f3083f1 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -34,19 +34,42 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *  Load in all dependencies for IndexArchiveBundle, if necessary
- */
-
+ * Summaries and word document list stored in WebArchiveBundle's so load it
+ */
 require_once 'web_archive_bundle.php';
+/**
+ * Filters used to check if words appear in a given generation
+ */
 require_once 'bloom_filter_file.php';
+/**
+ *
+ */
 require_once 'bloom_filter_bundle.php';
+/**
+ *
+ */
 require_once 'gzip_compressor.php';
+/**
+ *
+ */
 require_once 'non_compressor.php';
+/**
+ *
+ */
 require_once 'utility.php';
-/** Loads common constants for web crawling*/
+/** Loads common constants for web crawling
+ *
+ */
 require_once 'crawl_constants.php';

 /**
+ * Enumerative interface for common constants between WordIterator and
+ * IndexArchiveBundle
+ *
+ * These constants are used as fields in arrays. They are negative to
+ * distinguish them from normal array elements 0, 1, 2... However, this
+ * means you need to be slightly careful if you try to sort the array
+ * as this might screw things up
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -64,7 +87,12 @@ interface IndexingConstants


 /**
+ * Callback function used to set the offsets into the archive file from
+ * the particular word info in the header block of a WordArchive
  *
+ * @param array $data
+ * @param array $objects
+ * @param string $offset_field
  */
 function setOffsetPointers($data, &$objects, $offset_field)
 {
@@ -100,21 +128,22 @@ function setOffsetPointers($data, &$objects, $offset_field)
  *
  * @author Chris Pollett
  * @package seek_quarry
+ * @subpackage library
  */
 class WordIterator implements IndexingConstants, CrawlConstants
 {
     var $word_key;
     var $index;
     var $seen_docs;
+    var $restricted_seen_docs;
+    var $count_block_unfiltered;
     var $num_docs;
-    var $diagnostics;

-    //common word fields
+
     var $next_offset;
     var $last_pointed_block;
     var $list_offset;

-    //rare word fields

     var $block_pointers;
     var $num_full_blocks;
@@ -127,25 +156,31 @@ class WordIterator implements IndexingConstants, CrawlConstants
     /**
      *
      */
-    public function __construct($word_key, $index, $limit = 0)
+    public function __construct($word_key, $index, $limit = 0, $info_block = NULL)
     {
         $this->word_key = $word_key;
         $this->index = $index;
         $this->limit = $limit;
-        $this->reset();
+        $this->reset($info_block);
     }

     /**
      *
      */
-    public function reset()
+    public function reset($info_block = NULL)
     {
+        $this->restricted_seen_docs = 0;
+        $this->count_block_unfiltered = 0;
+
         $partition =
             WebArchiveBundle::selectPartition($this->word_key,
                 $this->index->num_partitions_index);

-        $this->info_block = $this->index->getPhraseIndexInfo($this->word_key);
-
+        if($info_block == NULL) {
+        	    $this->info_block = $this->index->getPhraseIndexInfo($this->word_key);
+        } else {
+            $this->info_block = $info_block;
+        }
         if($this->info_block !== NULL) {
             $this->num_generations = count($this->info_block['GENERATIONS']);
             $count_till_generation = $this->info_block[self::COUNT];
@@ -171,8 +206,10 @@ class WordIterator implements IndexingConstants, CrawlConstants

         }

+        $this->seen_docs = $count_till_generation -
+            $this->info_block[self::COUNT];
         $this->initGeneration();
-        $this->seen_docs = $this->current_pointer * BLOCK_SIZE;
+

     }

@@ -193,8 +230,9 @@ class WordIterator implements IndexingConstants, CrawlConstants
             $this->num_docs = $info_block['TOTAL_COUNT'];
             $this->num_docs_generation = $info_block[self::COUNT];

-            $this->current_pointer = floor($this->limit / BLOCK_SIZE);
-
+            $this->current_pointer =
+                max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0);
+            $this->seen_docs += $this->current_pointer*BLOCK_SIZE;
             $this->last_block = $info_block[self::END_BLOCK];
             $this->num_full_blocks =
                 floor($this->num_docs_generation / BLOCK_SIZE);
@@ -215,8 +253,7 @@ class WordIterator implements IndexingConstants, CrawlConstants
                 if($info_block[self::LIST_OFFSET] === NULL) {
                     $this->list_offset = NULL;
                 } else {
-                    $this->list_offset = $info_block[self::LIST_OFFSET][0];
-                    $this->current_block_num =$info_block[self::LIST_OFFSET][1];
+                    $this->list_offset = $info_block[self::LIST_OFFSET];
                 }
             }

@@ -240,14 +277,53 @@ class WordIterator implements IndexingConstants, CrawlConstants
             if($this->current_pointer == $this->num_full_blocks) {
                 $pages = $this->last_block;
             } else if ($this->current_pointer >= $this->last_pointed_block) {
+                /* if there are more than COMMON_WORD_THRESHOLD many
+                   results and we're not at the last block yet
+                 */
                 if($this->list_offset === NULL) {
                     return -1;
                 }
-                $doc_block = $this->index->getWordDocBlock($this->word_key,
-                    $this->list_offset, $generation);
-
-                $pages = $doc_block[$this->word_key.":".$this->current_pointer];
+                $offset = $this->list_offset;
+                $found = false;
+                do {
+                    /* the link list is actually backwards to the order we want
+                       For now, we cycle along the list from the last data
+                       stored until we find the block we want. This is slow
+                       but we are relying on the fact that each generation is
+                       not too big.
+                     */
+                    $doc_block = $this->index->getWordDocBlock($this->word_key,
+                        $offset, $generation);
+                    $word_keys = array_keys($doc_block);
+                    $found_key = NULL;
+                    foreach($word_keys as $word_key) {
+                        if(strstr($word_key, $this->word_key.":")) {
+                            $found_key = $word_key;
+                            if(isset($doc_block[
+                                $found_key][self::LIST_OFFSET])) {
+                                //only one list offset/docblock
+                                break;
+                            }
+                        }
+                    }
+                    if($found_key === NULL) {
+                        break;
+                    }
+                    if(isset($doc_block[
+                        $this->word_key.":".$this->current_pointer])) {
+                        $found = true;
+                        break;
+                    }
+                    $offset = $doc_block[$found_key][self::LIST_OFFSET];
+                } while($offset != NULL);
+                if($found != true) {
+                    $pages = array();
+                } else {
+                    $pages = $doc_block[
+                        $this->word_key.":".$this->current_pointer];
+                }
             } else {
+                //first COMMON_WORD_THRESHOLD many results fast
                 if(isset($this->block_pointers[$this->current_pointer])) {
                     $doc_block = $this->index->getWordDocBlock($this->word_key,
                         $this->block_pointers[$this->current_pointer],
@@ -265,15 +341,18 @@ class WordIterator implements IndexingConstants, CrawlConstants
                     $pages = array();
                 }
             }
-
+
             if($this->seen_docs < $this->limit) {
                 $diff_offset = $this->limit - $this->seen_docs;
+
                 $pages = array_slice($pages, $diff_offset);
             }
+            $this->count_block_unfiltered = count($pages);

             if($restrict_phrases != NULL) {
+
                  $out_pages = array();
-                 if(count($pages) >0 ) {
+                 if(count($pages) > 0 ) {
                      foreach($pages as $doc_key => $doc_info) {

                          if(isset($doc_info[self::SUMMARY_OFFSET])) {
@@ -314,28 +393,27 @@ class WordIterator implements IndexingConstants, CrawlConstants
     public function nextDocsWithWord($restrict_phrases = NULL)
     {
         $doc_block = $this->currentDocsWithWord($restrict_phrases);
-
-        $this->seen_docs += count($doc_block);
-
+        if($this->seen_docs <  $this->limit) {
+            $this->seen_docs = $this->count_block_unfiltered + $this->limit;
+        } else {
+        	    $this->seen_docs += $this->count_block_unfiltered;
+        }
+        $this->restricted_seen_docs += count($doc_block);
         if($doc_block == -1 || !is_array($doc_block)) {
             return NULL;
         }
-        if(isset($doc_block[self::LIST_OFFSET]) &&
-            $doc_block[self::LIST_OFFSET] != NULL) {
-            $this->list_offset = $doc_block[self::LIST_OFFSET];
-        }
-
+
         $this->current_pointer ++;
         if($this->current_pointer > $this->num_full_blocks) {
             $flag = false;
             while ($this->info_block['CURRENT_GENERATION_INDEX'] <
-                $this->num_generations -1 && !$flag) {
+                $this->num_generations - 1 && !$flag) {
                 $this->info_block['CURRENT_GENERATION_INDEX']++;
                 $flag = $this->initGeneration();
             }
             if ($this->info_block['CURRENT_GENERATION_INDEX'] >=
-                $this->num_generations -1) {
-                $this->current_pointer = -1;
+                $this->num_generations - 1) {
+                $this->current_pointer = - 1;
             }
         }

@@ -349,10 +427,11 @@ class WordIterator implements IndexingConstants, CrawlConstants
  *
  * @author Chris Pollett
  * @package seek_quarry
+ * @subpackage library
  */
 class IndexArchiveBundle implements IndexingConstants, CrawlConstants
 {
-
+    var $diagnostics;
     var $dir_name;
     var $description;
     var $num_partitions_summaries;
@@ -627,22 +706,23 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
      *
      */
     public function getSummariesByHash($word_key, $limit, $num,
-        $restrict_phrases = NULL, $phrase_key = NULL)
+        $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL)
     {
         if($phrase_key ==  NULL) {
             $phrase_key = $word_key;
         }
-
-        $phrase_info = $this->getPhraseIndexInfo($phrase_key);
+
+        if($phrase_info == NULL) {
+            	$phrase_info = $this->getPhraseIndexInfo($phrase_key);
+        }

         if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT])
             && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) {
-
             $this->addPhraseIndex(
                 $word_key, $restrict_phrases, $phrase_key, $limit + $num);
         }

-        $iterator = new WordIterator($phrase_key, $this, $limit);
+        $iterator = new WordIterator($phrase_key, $this, $limit, $phrase_info);

         $num_retrieved = 0;
         $pages = array();
@@ -650,6 +730,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
          while(is_array($next_docs = $iterator->nextDocsWithWord()) &&
             $num_retrieved < $num) {
              $num_docs_in_block = count($next_docs);
+
              foreach($next_docs as $doc_key => $doc_info) {
                  if(isset($doc_info[self::SUMMARY_OFFSET])) {
                      $page = $this->getPage(
@@ -764,7 +845,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
                     }
                 }
             }
-
             $num_generations = count($info['GENERATIONS']);
             if($num_generations == 0) {
                 return NULL;
@@ -776,7 +856,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
                 $block_info =
                     $this->readPartitionInfoBlock(
                         $partition, $info['GENERATIONS'][$i]);
-
                 $sum_count += $block_info[$phrase_key][self::COUNT];
             }

@@ -799,7 +878,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
                 $info['CURRENT_GENERATION_INDEX'];
             $phrase_info['TOTAL_COUNT'] = $info['TOTAL_COUNT'];
             $phrase_info['GENERATIONS'] = $info['GENERATIONS'];
-
             return $phrase_info;
         } else {
             return NULL;
@@ -857,7 +935,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
                 $word_data[$phrase_key] =
                     array_slice($buffer, 0, COMMON_WORD_THRESHOLD);

-                $this->addPartitionWordData($partition,$word_data, $first_time);
+                $this->addPartitionWordData($partition, $word_data, $first_time);
                 $first_time = false;
                 $buffer = array_slice($buffer, COMMON_WORD_THRESHOLD);
                 $current_count += COMMON_WORD_THRESHOLD;
@@ -954,3 +1032,4 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

 }
 ?>
+
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 37343e4ae..c22b1b664 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -34,7 +34,7 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *  load the stem word function, if necessary
+ *  Load the stem word function, if necessary
  */
 require_once BASE_DIR."/lib/porter_stemmer.php";

@@ -44,7 +44,7 @@ require_once BASE_DIR."/lib/porter_stemmer.php";
 require_once BASE_DIR."/lib/crawl_constants.php";

 /**
- * library of functions used to manipulate words and phrases
+ * Library of functions used to manipulate words and phrases
  *
  *
  * @author Chris Pollett
@@ -55,7 +55,11 @@ require_once BASE_DIR."/lib/crawl_constants.php";
 class PhraseParser
 {
     /**
+     * Converts a summary of a web page into a string of space separated words
      *
+     * @param array $page associative array of page summary data. Contains
+     *      title, description, and links fields
+     * @return string the concatenated words extracted from the page summary
      */
     static function extractWordStringPageSummary($page)
     {
@@ -80,7 +84,12 @@ class PhraseParser
     }

     /**
+     * Extracts all phrases (sequences of adjacent words) from $string of
+     * length less than or equal to $len.
      *
+     * @param string $string subject to extract phrases from
+     * @param int $len longest length of phrases to consider
+     * @return array pairs of the form (phrase, number of occurrences)
      */
     static function extractPhrasesAndCount($string,
         $len =  MAX_PHRASE_LEN)
@@ -99,7 +108,12 @@ class PhraseParser
     }

     /**
+     * Extracts all phrases (sequences of adjacent words) from $string of
+     * length exactly equal to $len.
      *
+     * @param string $string subject to extract phrases from
+     * @param int $phrase_len length of phrases to consider
+     * @return array pairs of the form (phrase, number of occurrences)
      */
     static function extractPhrasesOfLength($string, $phrase_len)
     {
@@ -114,7 +128,15 @@ class PhraseParser
     }

     /**
+     * Extracts phrases (sequences of adjacent words) from $string of
+     * length exactly equal to $len, beginning with the $offset'th word.
+     * This extracts the $len many words after offset, then the $len
+     * many words after that, and so on.
      *
+     * @param string $string subject to extract phrases from
+     * @param int $phrase_len length of phrases to consider
+     * @param int $offset the first word to begin with
+     * @return array pairs of the form (phrase, number of occurrences)
      */
     static function extractPhrasesOfLengthOffset($string,
         $phrase_len, $offset)
diff --git a/lib/porter_stemmer.php b/lib/porter_stemmer.php
index 74f6d3f99..890260322 100755
--- a/lib/porter_stemmer.php
+++ b/lib/porter_stemmer.php
@@ -50,9 +50,33 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 class PorterStemmer
 {

-    static $buffer, $k, $j;
+    /**
+     * storage used in computing the stem
+     * @var string
+     */
+    static $buffer;
+    /**
+     * Index of the current end of the word at the current state of computing
+     * its stem
+     * @var int
+     */
+    static $k;
+    /**
+     * Index to start of the suffix of the word being considered for
+     * manipulation
+     * @var int
+     */
+    static $j;

-    public static function stem($word)
+    /**
+     * Computes the stem of an English word
+     *
+     * For example, jumps, jumping, jumpy, all have jump as a stem
+     *
+     * @param string $word the string to stem
+     * @return string the stem of $word
+     */
+    static function stem($word)
     {
         self::$buffer = $word;

@@ -74,6 +98,7 @@ class PorterStemmer
      * Checks to see if the ith character in the buffer is a consonant
      *
      * @param int $i the character to check
+     * @return bool whether the ith character is a consonant
      */
     private static function cons($i)
     {
@@ -88,15 +113,17 @@ class PorterStemmer
         }
     }

-    /** m() measures the number of consonant sequences between k0 and j. if c is
-     *  a consonant sequence and v a vowel sequence, and <.> indicates arbitrary
-     *  presence,
-
-     *    <c><v>       gives 0
-     *    <c>vc<v>     gives 1
-     *    <c>vcvc<v>   gives 2
-     *    <c>vcvcvc<v> gives 3
+    /**
+     * m() measures the number of consonant sequences between 0 and j. if c is
+     * a consonant sequence and v a vowel sequence, and [.] indicates arbitrary
+     * presence,
+     *  <pre>
+     *    [c][v]       gives 0
+     *    [c]vc[v]     gives 1
+     *    [c]vcvc[v]   gives 2
+     *    [c]vcvcvc[v] gives 3
      *    ....
+     *  </pre>
      */
     private static function m()
     {
@@ -130,7 +157,11 @@ class PorterStemmer
         }
     }

-    /* vowelinstem() is TRUE <=> k0,...j contains a vowel */
+    /**
+     * Checks if 0,...$j contains a vowel
+     *
+     * @return bool whether it does
+     */

     private static function vowelinstem()
     {
@@ -140,7 +171,11 @@ class PorterStemmer
         return false;
     }

-    /* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
+    /**
+     * Checks if $j,($j-1) contain a double consonant.
+     *
+     * @return bool whether $j,($j-1) contain a double consonant
+     */

     private static function doublec($j)
     {
@@ -149,14 +184,17 @@ class PorterStemmer
         return self::cons($j);
     }

-    /* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
-       and also if the second c is not w,x or y. this is used when trying to
-       restore an e at the end of a short word. e.g.
-
-          cav(e), lov(e), hop(e), crim(e), but
-          snow, box, tray.
-
-    */
+    /**
+     * Checks whether the letters at the indices $i-2, $i-1, $i in the buffer
+     * have the form consonant - vowel - consonant and also if the second c is
+     * not w,x or y. this is used when trying to restore an e at the end of a
+     * short word. e.g.
+     *<pre>
+     *    cav(e), lov(e), hop(e), crim(e), but
+     *    snow, box, tray.
+     *</pre>
+     * @return bool whether the letters at indices have the given form
+     */

     private static function cvc($i)
     {
@@ -169,7 +207,12 @@ class PorterStemmer
         return true;
     }

-    /* ends(s) is TRUE <=> k0,...k ends with the string s. */
+    /**
+     * Checks if the buffer currently ends with the string $s
+     *
+     * @param string $s string to use for check
+     * @return bool whether buffer currently ends with $s
+     */

     private static function ends($s)
     {
@@ -184,8 +227,12 @@ class PorterStemmer
         return true;
     }

-    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
-       k. */
+    /**
+     * setto($s) sets (j+1),...k to the characters in the string $s, readjusting
+     * k.
+     *
+     * @param string $s string to modify the end of buffer with
+     */

     private static function setto($s)
     {
@@ -195,34 +242,38 @@ class PorterStemmer
         self::$k = self::$j + $len;
     }

-    /* r(s) is used further down. */
-
+    /**
+     * Sets the ending in the buffer to $s if the number of consonant sequences
+     * between $k and $j is positive.
+     *
+     * @param string $s what to change the suffix to
+     */
     private static function r($s)
     {
         if (self::m() > 0) self::setto($s);
     }

-    /* step1ab() gets rid of plurals and -ed or -ing. e.g.
-
-           caresses  ->  caress
-           ponies    ->  poni
-           ties      ->  ti
-           caress    ->  caress
-           cats      ->  cat
-
-           feed      ->  feed
-           agreed    ->  agree
-           disabled  ->  disable
-
-           matting   ->  mat
-           mating    ->  mate
-           meeting   ->  meet
-           milling   ->  mill
-           messing   ->  mess
-
-           meetings  ->  meet
-
-    */
+    /** step1ab() gets rid of plurals and -ed or -ing. e.g.
+     * <pre>
+     *     caresses  ->  caress
+     *     ponies    ->  poni
+     *     ties      ->  ti
+     *     caress    ->  caress
+     *     cats      ->  cat
+     *
+     *     feed      ->  feed
+     *     agreed    ->  agree
+     *     disabled  ->  disable
+     *
+     *     matting   ->  mat
+     *     mating    ->  mate
+     *     meeting   ->  meet
+     *     milling   ->  mill
+     *     messing   ->  mess
+     *
+     *     meetings  ->  meet
+     * </pre>
+     */

     private static function step1ab()
     {
@@ -256,7 +307,9 @@ class PorterStemmer
        }
     }

-    /* step1c() turns terminal y to i when there is another vowel in the stem. */
+    /**
+     * step1c() turns terminal y to i when there is another vowel in the stem.
+     */

     private static function step1c()
     {
@@ -266,9 +319,11 @@ class PorterStemmer
     }


-    /* step2() maps double suffices to single ones. so -ization ( = -ize plus
-       -ation) maps to -ize etc.Note that the string before the suffix must give
-       m() > 0. */
+    /**
+     * step2() maps double suffices to single ones. so -ization ( = -ize plus
+     * -ation) maps to -ize etc. Note that the string before the suffix must give
+     * m() > 0.
+     */
     private static function step2()
     {
         if(self::$k < 1) return;
@@ -314,7 +369,9 @@ class PorterStemmer
         }
     }

-    /* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */
+    /**
+     * step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
+     */

     private static function step3()
     {
@@ -338,8 +395,9 @@ class PorterStemmer
         }
     }

-    /* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
+    /**
+     * step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
+     */
     private static function step4()
     {
         if(self::$k < 1) return;
diff --git a/lib/priority_queue.php b/lib/priority_queue.php
index d6973d2ef..b50468948 100755
--- a/lib/priority_queue.php
+++ b/lib/priority_queue.php
@@ -34,11 +34,20 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *  Load in base classes and interfaces,get the crawlHash function, if necessary
+ *  Load in base class
  */
 require_once "string_array.php";
+/**
+ *
+ */
 require_once "notifier.php";
+/**
+ *
+ */
 require_once "utility.php";
+/**
+ *
+ */
 require_once "crawl_constants.php";

 /**
@@ -53,19 +62,43 @@ require_once "crawl_constants.php";
  */
 class PriorityQueue extends StringArray implements CrawlConstants
 {
+    /**
+     *
+     *
+     */
     var $num_values;
+    /**
+     *
+     *
+     */
     var $value_size;
+    /**
+     *
+     *
+     */
     var $weight_size = 4; //size of a float

+    /**
+     *
+     *
+     */
     var $count;
+    /**
+     *
+     *
+     */
     var $min_or_max;

+    /**
+     *
+     *
+     */
     var $notifier; // who to call if move an item in queue

     /**
      *
      */
-    public function __construct($fname, $num_values, $value_size,
+    function __construct($fname, $num_values, $value_size,
         $min_or_max, $notifier = NULL,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
     {
@@ -85,7 +118,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function peek($i = 1)
+    function peek($i = 1)
     {
         if($i < 1 || $i > $this->count) {
             crawlLog("Peek Index $i not in Range [1, {$this->count}]");
@@ -97,7 +130,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function poll($i = 1)
+    function poll($i = 1)
     {
         if($i < 1 || $i > $this->count) {
             crawlLog("Index $i not in Range [1, {$this->count}]");
@@ -118,7 +151,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function insert($data, $weight)
+    function insert($data, $weight)
     {
         if($this->count == $this->num_values) {
             return false;
@@ -136,7 +169,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function adjustWeight($i, $delta)
+    function adjustWeight($i, $delta)
     {
         if( ($tmp = $this->peek($i)) === false) {
             crawlLog("Index $i not in queue adjust weight failed");
@@ -166,7 +199,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function printContents()
+    function printContents()
     {
         for($i = 1; $i <= $this->count; $i++) {
             $row = $this->peek($i);
@@ -177,7 +210,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function getContents()
+    function getContents()
     {
         $rows = array();
         for($i = 1; $i <= $this->count; $i++) {
@@ -190,7 +223,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
     /**
      *
      */
-    public function normalize($new_total = NUM_URLS_QUEUE_RAM)
+    function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
         $count = $this->count;
         $total_weight = $this->totalWeight();
diff --git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php
index 2c5e84112..d83df545e 100755
--- a/lib/processors/pdf_processor.php
+++ b/lib/processors/pdf_processor.php
@@ -34,7 +34,7 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Load in the base class if necessary
  */

 require_once BASE_DIR."/lib/processors/text_processor.php";
diff --git a/lib/string_array.php b/lib/string_array.php
index 7d61ec1c5..3907eb805 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -52,34 +52,41 @@ class StringArray extends PersistentStructure
 {

     /**
-     *
-     */
-    var $filename;
-    /**
-     *
+     * Number of items to be stored in the StringArray
+     * @var int
      */
     var $num_values;
     /**
-     *
+     * Size of each item in bytes to be stored
+     * @var int
      */
-    var $array_size;
+    var $data_size;
     /**
-     *
+     * Number of bytes of storage needed by the string array
+     * @var int
      */
-    var $data_size;
+    var $array_size;
     /**
-     *
+     * Character string used to store the packed data of the StringArray
+     * @var string
      */
     var $string_array;


     /**
+     * Initializes the fields of the StringArray and its parent class
+     * PersistentStructure. Creates a null filled string array of size
+     * $this->string_array_size to store data in.
      *
+     * @param string $fname the name of the file to store data persistently in
+     * @param int $num_values the number of items the StringArray will store
+     * @param int $data_size the size in bytes of a single item
+     * @param int $save_frequency how often the StringArray should be stored to
+     *      disk
      */
     public function __construct($fname, $num_values, $data_size,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
     {
-        $this->filename = $fname;
         $this->num_values = $num_values;
         $this->data_size = $data_size;

@@ -93,7 +100,10 @@ class StringArray extends PersistentStructure


     /**
+     *  Looks up the ith item in the StringArray
      *
+     *  @param int $i array index of item to look up
+     *  @return string the looked-up item of length $this->data_size
      */
     public function get($i)
     {
@@ -103,7 +113,11 @@ class StringArray extends PersistentStructure
     }

     /**
+     * Puts data into the ith item of the StringArray
      *
+     * @param int $i array index of where to store data
+     * @param string $data at least $this->data_size many bytes of data to
+     *      store
      */
     public function put($i, $data)
     {
diff --git a/lib/web_archive.php b/lib/web_archive.php
index bf4dabf78..b41a00dc7 100755
--- a/lib/web_archive.php
+++ b/lib/web_archive.php
@@ -50,19 +50,39 @@ require_once "utility.php";
 class WebArchive
 {

-    const OPEN_AND_CLOSE = 1;
-    const OPEN = 2;
-    const CLOSE = 3;
-
+    /**
+     *
+     */
     var $filename;
+    /**
+     *
+     */
     var $iterator_pos;
+    /**
+     *
+     */
     var $compressor;
+    /**
+     *
+     */
     var $count;

     /**
      *
      */
-    public function __construct($fname, $compressor, $fast_construct = false)
+    const OPEN_AND_CLOSE = 1;
+    /**
+     *
+     */
+    const OPEN = 2;
+    /**
+     *
+     */
+    const CLOSE = 3;
+    /**
+     *
+     */
+    function __construct($fname, $compressor, $fast_construct = false)
     {
         $this->filename = $fname;
         $this->compressor = $compressor;
@@ -83,7 +103,7 @@ class WebArchive
     /**
      *
      */
-    public function readInfoBlock()
+    function readInfoBlock()
     {
         $fh =  fopen($this->filename, "r");
         $len = $this->seekEndObjects($fh);
@@ -101,7 +121,7 @@ class WebArchive
     /**
      *
      */
-    public function writeInfoBlock($fh = NULL, &$data = NULL)
+    function writeInfoBlock($fh = NULL, &$data = NULL)
     {
         $open_flag = false;
         if($fh == NULL) {
@@ -129,7 +149,7 @@ class WebArchive
     /**
      *
      */
-    public function seekEndObjects($fh)
+    function seekEndObjects($fh)
     {
         fseek($fh, - 4, SEEK_END);
         $len_block_arr = unpack("N", fread($fh, 4));
@@ -142,7 +162,7 @@ class WebArchive
     /**
      *
      */
-    public function addObjects($offset_field, &$objects,
+    function addObjects($offset_field, &$objects,
         $data = NULL, $callback = NULL, $return_flag = true)
     {

@@ -192,7 +212,7 @@ class WebArchive
     /**
      *
      */
-    public function open($mode = "r")
+    function open($mode = "r")
     {
         $fh = fopen($this->filename, $mode);
         return $fh;
@@ -201,7 +221,7 @@ class WebArchive
     /**
      * Closes a file handle (which should be of a web archive)
      */
-    public function close($fh)
+    function close($fh)
     {
         fclose($fh);
     }
@@ -209,7 +229,7 @@ class WebArchive
     /**
      *
      */
-    public function getObjects($offset, $num, $next_flag = true, $fh = NULL)
+    function getObjects($offset, $num, $next_flag = true, $fh = NULL)
     {

         $open_flag = false;
@@ -261,7 +281,7 @@ class WebArchive
      * @param int $num number of objects to return
      * @return array an array of objects from the web archive
      */
-    public function currentObjects($num)
+    function currentObjects($num)
     {
         return $this->getObjects($this->iterator_pos, $num, false);
     }
@@ -274,7 +294,7 @@ class WebArchive
      * @param int $num number of objects to return
      * @return array an array of objects from the web archive
      */
-    public function nextObjects($num)
+    function nextObjects($num)
     {
         return $this->getObjects($this->iterator_pos, $num);
     }
@@ -283,7 +303,7 @@ class WebArchive
      * Resets the iterator for this web archive to the first object
      * in the archive
      */
-    public function reset()
+    function reset()
     {
         $this->iterator_pos = 0;
     }
diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php
index 72c50d3d3..ff7fac6d4 100755
--- a/lib/web_archive_bundle.php
+++ b/lib/web_archive_bundle.php
@@ -34,7 +34,7 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- * A WebArchiveBundle is a collection of WebArchive, so need definition of
+ * A WebArchiveBundle is a collection of WebArchive, so load definition of
  * web archive
  */
 require_once 'web_archive.php';
@@ -104,7 +104,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function __construct($dir_name, $filter_size = -1,
+    function __construct($dir_name, $filter_size = -1,
         $num_partitions = NULL, $description = NULL,
         $compressor = "GzipCompressor")
     {
@@ -173,7 +173,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function addPages($key_field, $offset_field, &$pages)
+    function addPages($key_field, $offset_field, &$pages)
     {
         $partition_queue = array();
         for($i = 0; $i < $this->num_partitions; $i++) {
@@ -215,7 +215,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function getPage($key, $offset)
+    function getPage($key, $offset)
     {
         $partition =
             WebArchiveBundle::selectPartition($key, $this->num_partitions);
@@ -226,7 +226,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function getPageByPartition($partition, $offset, $file_handle = NULL)
+    function getPageByPartition($partition, $offset, $file_handle = NULL)
     {
         $page_array =
             $this->getPartition($partition)->getObjects(
@@ -242,7 +242,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function addPageFilter($key_field, &$page)
+    function addPageFilter($key_field, &$page)
     {
         if($this->filter_size > 0) {
             $this->page_exists_filter_bundle->add($page[$key_field]);
@@ -255,7 +255,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function addObjectsPartition($offset_field, $partition,
+    function addObjectsPartition($offset_field, $partition,
         &$objects, $data = NULL, $callback = NULL, $return_flag = true)
     {
         $num_objects = count($objects);
@@ -268,7 +268,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function readPartitionInfoBlock($partition)
+    function readPartitionInfoBlock($partition)
     {
         return $this->getPartition($partition)->readInfoBlock();
     }
@@ -276,7 +276,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function writePartitionInfoBlock($partition, &$data)
+    function writePartitionInfoBlock($partition, &$data)
     {
         $this->getPartition($partition)->writeInfoBlock(NULL, $data);
     }
@@ -284,7 +284,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function differencePageKeysFilter($pages, $key_field)
+    function differencePageKeysFilter($pages, $key_field)
     {
         if($this->filter_size > 0) {
             $page_array = array();
@@ -303,7 +303,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function differencePagesFilter(&$page_array, $field_name = NULL)
+    function differencePagesFilter(&$page_array, $field_name = NULL)
     {
         $this->page_exists_filter_bundle->differenceFilter(
             $page_array, $field_name);
@@ -312,7 +312,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function forceSave()
+    function forceSave()
     {
         if($this->filter_size > 0) {
            $this->page_exists_filter_bundle->forceSave();
@@ -322,7 +322,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public function getPartition($index, $fast_construct = true)
+    function getPartition($index, $fast_construct = true)
     {
         if(!isset($this->partition[$index])) {
             //this might not have been open yet
@@ -355,7 +355,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public static function getArchiveInfo($dir_name)
+    static function getArchiveInfo($dir_name)
     {
         if(!is_dir($dir_name) || !file_exists($dir_name."/description.txt")) {
             $info = array();
@@ -375,7 +375,7 @@ class WebArchiveBundle
     /**
      *
      */
-    public static function selectPartition($value, $num_partitions)
+    static function selectPartition($value, $num_partitions)
     {

         $hash = substr(md5($value, true), 0, 4);
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index 13a5abad3..f64f21f67 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -64,7 +64,9 @@ require_once 'web_archive.php';
 require_once 'utility.php';

 /**
- *
+ * Encapsulates the data structures needed to have a queue of urls to crawl
+ * next
+ *
  * @author Chris Pollett
  *
  * @package seek_quarry
@@ -136,12 +138,15 @@ class WebQueueBundle implements Notifier
      */
     var $crawl_delay_filter;

+    /**
+     * Maximum offset into the url WebArchive before it is rebuilt
+     */
     const max_url_archive_offset = 1000000000;

     /**
      *
      */
-    public function __construct($dir_name,
+    function __construct($dir_name,
         $filter_size, $num_urls_ram, $min_or_max)
     {
         $this->dir_name = $dir_name;
@@ -227,7 +232,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function addUrlsQueue(&$url_pairs)
+    function addUrlsQueue(&$url_pairs)
     {
         $add_urls = array();
         $count = count($url_pairs);
@@ -269,7 +274,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function containsUrlQueue(&$url)
+    function containsUrlQueue(&$url)
     {
         $hash_url = crawlHash($url, true);
         $lookup_url = $this->lookupHashTable($hash_url);
@@ -279,7 +284,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function adjustQueueWeight(&$url, $delta)
+    function adjustQueueWeight(&$url, $delta)
     {
         $hash_url = crawlHash($url, true);
         $data = $this->lookupHashTable($hash_url);
@@ -297,7 +302,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function removeQueue($url)
+    function removeQueue($url)
     {
         $hash_url = crawlHash($url, true);
         $data = $this->lookupHashTable($hash_url);
@@ -319,7 +324,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function peekQueue($i = 1, $fh = NULL)
+    function peekQueue($i = 1, $fh = NULL)
     {
         $tmp = $this->to_crawl_queue->peek($i);
         if(!$tmp) {
@@ -350,7 +355,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function printContents()
+    function printContents()
     {
         $count = $this->to_crawl_queue->count;

@@ -360,7 +365,7 @@ class WebQueueBundle implements Notifier
         }
     }

-    public function getContents()
+    function getContents()
     {
         $count = $this->to_crawl_queue->count;
         $contents = array();
@@ -373,7 +378,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function normalize($new_total = NUM_URLS_QUEUE_RAM)
+    function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
         $this->to_crawl_queue->normalize();
     }
@@ -383,7 +388,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function openUrlArchive($mode = "r")
+    function openUrlArchive($mode = "r")
     {
         return $this->to_crawl_archive->open($mode);
     }
@@ -391,7 +396,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function closeUrlArchive($fh)
+    function closeUrlArchive($fh)
     {
         $this->to_crawl_archive->close($fh);
     }
@@ -399,7 +404,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function addSeenUrlFilter($url)
+    function addSeenUrlFilter($url)
     {
         $this->url_exists_filter_bundle->add($url);
     }
@@ -407,7 +412,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function differenceSeenUrls(&$url_array, $field_name = NULL)
+    function differenceSeenUrls(&$url_array, $field_name = NULL)
     {
         $this->url_exists_filter_bundle->differenceFilter(
             $url_array, $field_name);
@@ -416,7 +421,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function addGotRobotTxtFilter($host)
+    function addGotRobotTxtFilter($host)
     {
         $this->got_robottxt_filter->add($host);
     }
@@ -424,7 +429,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function containsGotRobotTxt($host)
+    function containsGotRobotTxt($host)
     {
         return $this->got_robottxt_filter->contains($host);
     }
@@ -432,7 +437,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function addDisallowedRobotFilter($host)
+    function addDisallowedRobotFilter($host)
     {
         $this->dissallowed_robot_filter->add($host);
     }
@@ -440,7 +445,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function containsDisallowedRobot($host_path)
+    function containsDisallowedRobot($host_path)
     {
         return $this->dissallowed_robot_filter->contains($host_path);
     }
@@ -448,7 +453,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function getRobotTxtAge()
+    function getRobotTxtAge()
     {

         $creation_time = intval(
@@ -460,7 +465,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function setCrawlDelay($host, $value)
+    function setCrawlDelay($host, $value)
     {
         $this->crawl_delay_filter->add("-1".$host);
             //used to say a crawl delay has been set
@@ -476,7 +481,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function getCrawlDelay($host)
+    function getCrawlDelay($host)
     {
         if(!$this->crawl_delay_filter->contains("-1".$host)) {
             return -1;
@@ -495,7 +500,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function constructHashTable($name, $num_values)
+    function constructHashTable($name, $num_values)
     {
         $this->hash_rebuild_count = 0;
         $this->max_hash_ops_before_rebuild = floor($num_values/4);
@@ -505,7 +510,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function lookupHashTable($key)
+    function lookupHashTable($key)
     {
         return $this->to_crawl_table->lookup($key);
     }
@@ -513,7 +518,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function deleteHashTable($value)
+    function deleteHashTable($value)
     {
         $this->to_crawl_table->delete($value);
         $this->hash_rebuild_count++;
@@ -525,7 +530,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function insertHashTable($key, $value)
+    function insertHashTable($key, $value)
     {
         $this->hash_rebuild_count++;
         if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) {
@@ -537,7 +542,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function rebuildHashTable()
+    function rebuildHashTable()
     {
         crawlLog("Rebuilding Hash table");
         $num_values = $this->to_crawl_table->num_values;
@@ -571,7 +576,7 @@ class WebQueueBundle implements Notifier
     * Since offsets are integers, even if the queue is kept relatively small,
     * periodically we will need to rebuild the archive for storing urls.
     */
-    public function rebuildUrlTable()
+    function rebuildUrlTable()
     {
         crawlLog("Rebuilding URL table");
         $dir_name = $this->dir_name;
@@ -613,7 +618,7 @@ class WebQueueBundle implements Notifier
     /**
      *
      */
-    public function emptyRobotFilters()
+    function emptyRobotFilters()
     {
         unlink($this->dir_name."/got_robottxt.ftr");
         unlink($this->dir_name."/dissallowed_robot.ftr");
diff --git a/models/phrase_model.php b/models/phrase_model.php
index e2225d2f5..9a5b40dac 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -98,9 +98,6 @@ class PhraseModel extends Model

         $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase);
         $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
-
-        $phrase_hash = crawlHash($phrase_string);
-
         /*
             we search using the stemmed words, but we format snippets in the
             results by bolding either
@@ -109,11 +106,17 @@ class PhraseModel extends Model
         $words =
             array_keys(PhraseParser::extractPhrasesAndCount($phrase_string));
             //stemmed
+        if(isset($words) && count($words) == 1) {
+            $phrase_string = $words[0];
+        }
+        $phrase_hash = crawlHash($phrase_string);
+
+        $phrase_info = $index_archive->getPhraseIndexInfo($phrase_hash);

-        if($index_archive->getPhraseIndexInfo($phrase_hash) != NULL) {
+        if($phrase_info != NULL) {

             $results = $index_archive->getSummariesByHash(
-                $phrase_hash, $low, $results_per_page);
+                $phrase_hash, $low, $results_per_page, NULL, NULL, $phrase_info);

             if(count($results) == 0) {
                 $results = NULL;
ViewGit