Completes initial documentation

Chris Pollett [2010-08-10 20:Aug:th]
Completes initial documentation
Filename
bin/queue_server.php
lib/bloom_filter_bundle.php
lib/index_archive_bundle.php
lib/priority_queue.php
lib/processors/pdf_processor.php
lib/processors/rtf_processor.php
lib/web_archive.php
lib/web_archive_bundle.php
lib/web_queue_bundle.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index abfcb19a8..40da07eb4 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -157,7 +157,7 @@ class QueueServer implements CrawlConstants
      * Makes a queue_server object with the supplied indexed_file_types
      *
      * As part of the creation process, a database manager is initialized so
-     * the queue_server cna make use of its file/folder manipulation functions.
+     * the queue_server can make use of its file/folder manipulation functions.
      */
     function __construct($indexed_file_types)
     {
@@ -190,7 +190,11 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Main runtime loop of the queue_server.
      *
+     * Loops until a stop message is received; checks for start, stop, and
+     * resume crawl messages; deletes any WebQueueBundle for which an
+     * IndexArchiveBundle does not exist; and processes schedule data.
      */
     function loop()
     {
@@ -249,9 +253,15 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Handles messages passed via files to the QueueServer.
      *
-     * @param array $info
-     * @return array
+     * These files are typically written by the CrawlDaemon::init()
+     * when the QueueServer is run using command-line arguments
+     *
+     * @param array $info associative array with info about current state of
+     *      queue_server
+     * @return array an updated version of $info reflecting changes that occurred
+     *      during the handling of the admin messages files.
      */
     function handleAdminMessages($info)
     {
@@ -274,7 +284,7 @@ class QueueServer implements CrawlConstants
                     }
                     if(isset($this->index_archive)) {
                         $this->index_archive->forceSave();
-                        // chmod so apahce can also write to these directories
+                        // chmod so apache can also write to these directories
                         $this->db->setWorldPermissionsRecursive(
                             CRAWL_DIR.'/cache/'.
                             self::index_data_base_name.$this->crawl_time);
@@ -321,8 +331,11 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Begins crawling based on the time, order, and restricted sites in $info
+     * Setting up a crawl involves creating a queue bundle and an
+     * index archive bundle
      *
-     * @param array $info
+     * @param array $info parameters for the crawl
      */
     function startCrawl($info)
     {
@@ -378,7 +391,8 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * Delete all the queue schedules in the cache that don't have an
+     * associated index bundle as this means that crawl has been deleted.
      */
     function deleteOrphanedBundles()
     {
@@ -399,9 +413,13 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
-     * @param string $base_dir
-     * @param string $callback_method
+     * Generic function used to process Data, Index, and Robot info schedules
+     * Finds the first file in the directory of schedules of the given
+     * type, and calls the appropriate callback method for that type.
+     *
+     * @param string $base_dir directory of schedules
+     * @param string $callback_method what method should be called to handle
+     *      a schedule
      */
     function processDataFile($base_dir, $callback_method)
     {
@@ -435,7 +453,9 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * Sets up the directory to look for a file of unprocessed
+     * index archive data from fetchers then calls the function
+     * processDataFile to process the oldest file found
      */
     function processIndexData()
     {
@@ -448,8 +468,10 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Adds the summary and index data in $file to summary bundle and word index
      *
-     * @param string $file
+     * @param string $file containing web page summaries and a mini-inverted
+     *      index for their content
      */
     function processIndexArchive($file)
     {
@@ -538,7 +560,9 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * Checks how old the oldest robot data is and dumps it if older than a
+     * threshold, then sets up the path to the robot schedule directory
+     * and tries to process a file of robots.txt robot paths data from there
      */
     function processRobotUrls()
     {
@@ -560,8 +584,11 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
-     * @param string $file
+     * Reads in $file of robot data adding host-paths to the disallowed
+     * robot filter and setting the delay in the delay filter of
+     * crawl delayed hosts
+     * @param string $file file to read of robot data, is removed after
+     *      processing
      */
     function processRobotArchive($file)
     {
@@ -602,7 +629,13 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * Deletes all robot information stored by the QueueServer.
+     *
+     * This function is called roughly every CACHE_ROBOT_TXT_TIME.
+     * It forces the crawler to redownload robots.txt files before hosts
+     * can continue to be crawled. This ensures the cached robots.txt
+     * file is never too old. Thus, if someone changes it to allow or disallow
+     * the crawler it will be noticed reasonably promptly.
      */
     function deleteRobotData()
     {
@@ -612,13 +645,16 @@ class QueueServer implements CrawlConstants
             self::robot_data_base_name.$this->crawl_time;
         $this->db->unlinkRecursive($robot_schedules, true);

-        crawlLog("... reseting robot bloom filters ...");
+        crawlLog("... resetting robot bloom filters ...");
         $this->web_queue->emptyRobotFilters();
     }

     /**
+     * Checks for a new crawl file or schedule data for the current crawl and
+     * if such a file exists then processes its contents adding the relevant
+     * urls to the priority queue
      *
-     * @return array
+     * @return array info array with continue status
      */
     function processQueueUrls()
     {
@@ -631,10 +667,9 @@ class QueueServer implements CrawlConstants
         if(file_exists(CRAWL_DIR."/schedules/".self::schedule_start_name)) {
             crawlLog(
                 "Start schedule urls".CRAWL_DIR.
-                    "/schedules/".self::schedule_start_name);
-            $info = array_merge($info,
-                $this->processDataArchive(
-                    CRAWL_DIR."/schedules/".self::schedule_start_name));
+                    "/schedules/".self::schedule_start_name);
+            $this->processDataArchive(
+                CRAWL_DIR."/schedules/".self::schedule_start_name);
             return $info;
         }

@@ -649,8 +684,11 @@ class QueueServer implements CrawlConstants
     }

     /**
-     * @param string $file
-     * @return array
+     * Processes a file of to-crawl urls adding to or adjusting the weight in
+     * the PriorityQueue of those which have not been seen. Also
+     * updates the queue with seen url info
+     *
+     * @param string $file containing serialized to crawl and seen url info
      */
     function processDataArchive($file)
     {
@@ -658,8 +696,6 @@ class QueueServer implements CrawlConstants

         $sites = unserialize(file_get_contents($file));

-        $info = array();
-
         if(isset($sites[self::MACHINE])) {
             $this->most_recent_fetcher = $sites[self::MACHINE];
         }
@@ -788,13 +824,12 @@ class QueueServer implements CrawlConstants
             crawlLog("URL: $url");
         }

-        return $info;
-
     }

     /**
+     * Removes the already seen urls from the supplied array
      *
-     * @param array &$sites
+     * @param array &$sites url data to check if seen
      */
     function deleteSeenUrls(&$sites)
     {
diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php
index 32ff0be7b..6962e7b34 100644
--- a/lib/bloom_filter_bundle.php
+++ b/lib/bloom_filter_bundle.php
@@ -141,8 +141,8 @@ class BloomFilterBundle
     }

     /**
-     * Removes from the passed array those elements $elt who either are not in
-     * the filter bundle or whose $elt[$field_name] is not in the bundle.
+     * Removes from the passed array those elements $elt who either are in
+     * the filter bundle or whose $elt[$field_name] is in the bundle.
      *
      * @param array &$arr the array to remove elements from
      * @param string $field_name if not NULL the field name of $arr to use to
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index b06780150..0b759e982 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -38,27 +38,19 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  */
 require_once 'web_archive_bundle.php';
 /**
- * Filters used to check if words appear in a given generation
+ * Bloom Filter used by BloomFilterBundle
  */
 require_once 'bloom_filter_file.php';
 /**
- *
+ * Used to check if a page is already stored in the WebArchiveBundle
  */
 require_once 'bloom_filter_bundle.php';
 /**
- *
- */
-require_once 'gzip_compressor.php';
-/**
- *
- */
-require_once 'non_compressor.php';
-/**
- *
+ * Used for crawlLog and crawlHash
  */
 require_once 'utility.php';
-/** Loads common constants for web crawling
- *
+/**
+ * Loads common constants for web crawling
  */
 require_once 'crawl_constants.php';

@@ -88,7 +80,7 @@ interface IndexingConstants

 /**
  * Callback function used to set the offsets into the archive file from
- * the paritcular word info in the header block of a WordArchive
+ * the particular word info in the header block of a WordArchive
  *
  * @param array $data
  * @param array $objects
@@ -125,86 +117,137 @@ function setOffsetPointers($data, &$objects, $offset_field)
 }

 /**
+ * Used to iterate through the documents associated with a word in
+ * an IndexArchiveBundle. It also makes it easy to get the summaries
+ * of these documents and restrict the documents by additional words.
+ *
+ * A description of how words and the documents containing them are stored
+ * is given in the documentation of IndexArchiveBundle. To iterate over
+ * all documents containing a word, its hash, word_key, is formed. Then using
+ * the Bloom filter for that partition, it is determined if the word is stored
+ * at all, and if it is, which generations it occurs in. Then the iterator
+ * is set to point to the first block of the first generation the word appears
+ * in that is greater than the limit of the WordIterator. Thereafter,
+ * nextDocsWithWord will advance $this->current_pointer by one per call.
+ * $this->current_pointer keeps track of which block of documents containing
+ * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and
+ * there are still more blocks, then the corresponding block_pointer of the word
+ * from the generation's partition info_block is used to look up the offset to
+ * the doc block. If it is greater than this value then the linked list
+ * of doc blocks pointed to for the partition is followed to get the appropriate
+ * block. This list is in the order that words were stored in the index so
+ * LIST_OFFSET points to the last block stored, which in turn points to the
+ * next to last block, etc. Finally, when all the blocks in the linked-list are
+ * exhausted, the remaining docs for that generation for that word are stored
+ * in the info block for the word itself (this will always be less than
+ * BLOCK_SIZE many). Once all the docs for a word for a generation have been
+ * iterated through, then iteration proceeds to the next generation containing
+ * the word.
  *
  * @author Chris Pollett
  * @package seek_quarry
  * @subpackage library
+ * @see IndexArchiveBundle
  */
 class WordIterator implements IndexingConstants, CrawlConstants
 {
     /**
-     *
+     * hash of word that the iterator iterates over
+     * @var string
      */
     var $word_key;
     /**
-     *
+     * The IndexArchiveBundle this index is associated with
+     * @var object
      */
     var $index;
     /**
-     *
+     * The number of documents already iterated over
+     * @var int
      */
     var $seen_docs;
     /**
-     *
+     * @var int
      */
     var $restricted_seen_docs;
     /**
-     *
+     * The number of documents in the current block before filtering
+     * by restricted words
+     * @var int
      */
     var $count_block_unfiltered;
     /**
-     *
+     * Estimate of the number of documents that this iterator can return
+     * @var int
      */
     var $num_docs;

     /**
-     *
+     * If iterating through the linked-list portions of the documents
+     * the next byte offset in the WebArchive based linked-list
+     * @var int
      */
     var $next_offset;
     /**
-     *
+     * Block number of the last block of docs
+     * @var int
      */
     var $last_pointed_block;
     /**
-     *
+     * @var int
      */
     var $list_offset;

     /**
-     *
+     * Pointers to offsets for blocks containing docs with the given word
+     * for the current generation
+     * @var array
      */
     var $block_pointers;
     /**
-     *
+     * Number of completely full blocks of documents for the current generation
+     * @var int
      */
     var $num_full_blocks;
     /**
-     *
+     * Number of generations word appears in
+     * @var int
      */
     var $num_generations;
     /**
-     *
+     * Used to store the contents of the last partially full block
+     * @var int
      */
     var $last_block;
     /**
-     *
+     *
+     * @var object
      */
     var $info_block;
     /**
-     *
+     * Stores the number of the current block of documents we are at in the
+     * set of all blocks of BLOCK_SIZE many documents
+     * @var int
      */
     var $current_pointer;
     /**
-     *
+     * First document that should be returned
+     * amongst all of the documents associated with the
+     * iterator's $word_key
+     * @var int
      */
     var $limit;

     /**
+     * Creates a word iterator with the given parameters.
      *
-     * @param string $word_key
-     * @param object $index
-     * @param int $limit
-     * @param object $info_block
+     * @param string $word_key hash of word or phrase to iterate docs of
+     * @param object $index the IndexArchiveBundle to use
+     * @param int $limit the first element to return from the list of docs
+     *      iterated over
+     * @param object $info_block the info block of the WebArchive
+     *      associated with the word in the index. If NULL, then this will
+     *      loaded in WordIterator::reset()
      */
     public function __construct($word_key, $index, $limit = 0, $info_block = NULL)
     {
@@ -215,8 +258,13 @@ class WordIterator implements IndexingConstants, CrawlConstants
     }

     /**
+     * Returns the iterators to the first document block that it could iterate
+     * over
      *
-     * @param object $info_block
+     * @param object $info_block the header block in the index WebArchiveBundle
+     *      for the word this iterator iterates over. If not NULL, this saves
+     *      the time to load it. If not it will be loaded, but this will be
+     *      slower.
      */
     public function reset($info_block = NULL)
     {
@@ -265,8 +313,9 @@ class WordIterator implements IndexingConstants, CrawlConstants
     }

     /**
+     * Sets up the iterator to iterate through the current generation.
      *
-     * @return bool
+     * @return bool whether the initialization succeeds
      */
     public function initGeneration()
     {
@@ -318,9 +367,11 @@ class WordIterator implements IndexingConstants, CrawlConstants
     }

     /**
-     *
-     * @param array $restrict_phrases
-     * @return array
+     * Gets the block of doc summaries associated with the current doc
+     * pointer and which match the array of additional word restrictions
+     * @param array $restrict_phrases an array of additional words or phrases
+     *      to see if contained in summary
+     * @return array doc summaries that match
      */
     public function currentDocsWithWord($restrict_phrases = NULL)
     {
@@ -442,9 +493,12 @@ class WordIterator implements IndexingConstants, CrawlConstants
     }

     /**
+     * Get the current block of doc summaries for the word iterator and advances
+     * the current pointer to the next block
      *
-     * @param array $restrict_phrases
-     * @return array
+     * @param array $restrict_phrases additional words to restrict doc summaries
+     *      returned
+     * @return array doc summaries matching the $restrict_phrases
      */
     public function nextDocsWithWord($restrict_phrases = NULL)
     {
@@ -480,6 +534,43 @@ class WordIterator implements IndexingConstants, CrawlConstants
 }

 /**
+ * Encapsulates a set of web page summaries and an inverted word-index of terms
+ * from these summaries which allow one to search for summaries containing a
+ * particular word.
+ *
+ * The basic file structures for an IndexArchiveBundle are:
+ * <ol>
+ * <li>A WebArchiveBundle for web page summaries.</li>
+ * <li>A set of WebArchiveBundles for the inverted index. Each such bundle
+ * is called a <b>generation</b>. These bundles have name index0, index1,...
+ * The file generations.txt keeps track of what is the current generation
+ * and how many words have been stored in it. A given generation can
+ * hold NUM_WORDS_PER_GENERATION words amongst all its partitions. After which
+ * the next generation begins. In a given generation, a word is stored in
+ * the partition that its hash key hashes to. The same word may appear in
+ * several generations. The info block for a partition for a particular
+ * generation contains objects for each word of the generation that hashed
+ * to that partition. Each such word object contains a count of the number
+ * of documents it occurred in for that generation. It also has an
+ * array of block_pointers to blocks of size BLOCK_SIZE. These blocks contain
+ * documents that the word occurred in, the score for the occurrence, and
+ * an offset into the summary file for that document. If the total number of
+ * documents is not a multiple of BLOCK_SIZE the remaining documents are stored
+ * directly in the word's info block object. If, in a given generation, a
+ * word occurs more than COMMON_WORD_THRESHOLD many times then the word object
+ * uses a LIST_OFFSET pointer to point to a linked list in the partition of
+ * additional blocks of documents for that word.
+ * </li>
+ * <li>For each partition and for all generations a BloomFilterFile is used
+ * to keep track of which words appear in which generations for a
+ * particular partition. These filters are stored in a folder within the
+ * IndexArchiveBundle called index_filters. When a word and documents
+ * containing it are stored in an IndexArchiveBundle, its word_key (its hash)
+ * is stored in the filter for the partition its word_key hashes to. Further,
+ * if the current generation is i, then word_key concatenated with i is
+ * also stored in this same filter.</li>
+ * </ol>
+ *
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -487,24 +578,74 @@ class WordIterator implements IndexingConstants, CrawlConstants
  */
 class IndexArchiveBundle implements IndexingConstants, CrawlConstants
 {
+    /**
+     * Used to keep track of the time to perform various operations
+     * in this IndexArchiveBundle
+     * @var array
+     */
     var $diagnostics;
+    /**
+     * Folder name to use for this IndexArchiveBundle
+     * @var string
+     */
     var $dir_name;
+    /**
+     * A short text name for this IndexArchiveBundle
+     * @var string
+     */
     var $description;
+    /**
+     * Number of partitions in the summaries WebArchiveBundle
+     * @var int
+     */
     var $num_partitions_summaries;
+    /**
+     * Number of partitions in the inverted word index
+     * (same for each generation)
+     * @var int
+     */
     var $num_partitions_index;
+    /**
+     * structure contains info about the current generation:
+     * its index (ACTIVE), and the number of words it contains
+     * (NUM_WORDS).
+     * @var array
+     */
     var $generation_info;
+    /**
+     * Number of words before a new generation is started
+     * @var int
+     */
     var $num_words_per_generation;
+    /**
+     * WebArchiveBundle for web page summaries
+     * @var object
+     */
     var $summaries;
+    /**
+     * WebArchiveBundle for inverted word index
+     * @var object
+     */
     var $index;
+    /**
+     * Bloom Filters used to figure out which words are in which generations for
+     * given partitions
+     * @var object
+     */
     var $index_partition_filters;

     /**
+     * Makes or initializes an IndexArchiveBundle with the provided parameters
      *
-     * @param string $dir_name
-     * @param int $filter_size
-     * @param int $num_partitions_summaries
-     * @param int $num_parititions_index
-     * @param string $description
+     * @param string $dir_name folder name to store this bundle
+     * @param int $filter_size size of a Bloom filter for the word index
+     *      partition filters as wells as for the page_exists_filters in
+     *      the WebArchiveBundles
+     * @param int $num_partitions_summaries number of WebArchive partitions
+     *      to use in the summaries WebArchiveBundle
+     * @param int $num_partitions_index number of WebArchive partitions
+     *      to use in the index WebArchiveBundle
+     * @param string $description a short text name for this IndexArchiveBundle
      */
     public function __construct($dir_name, $filter_size = -1,
         $num_partitions_summaries = NULL, $num_partitions_index = NULL,
@@ -546,11 +687,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Add the array of $pages to the summaries WebArchiveBundle pages being
+     * stored in the partition according to the $key_field and the field used
+     * to store the resulting offsets given by $offset_field.
      *
-     * @param string $key_field
-     * @param string $offset_field
-     * @param array $pages
-     * @return array
+     * @param string $key_field field used to select partition
+     * @param string $offset_field field used to record offsets after storing
+     * @param array $pages data to store
+     * @return array $pages adjusted with offset field
      */
     public function addPages($key_field, $offset_field, $pages)
     {
@@ -560,8 +704,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Adds the provided mini inverted index data to the IndexArchiveBundle
      *
-     * @param array $index_data
+     * @param array $index_data a mini inverted index of word_key=>doc data
+     *      to add to this IndexArchiveBundle
      */
     public function addIndexData($index_data)
     {
@@ -614,10 +760,18 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Adds the mini-inverted index data to a particular partition.
+     * It is assumed the word keys in this data would hash to the destined
+     * index partitions
      *
-     * @param int $partition
-     * @param array &$word_data
-     * @param bool $overwrite
+     * @param int $partition WebArchive in the index WebArchiveBundle of the
+     *      current generation to write to
+     * @param array &$word_data what to write
+     * @param bool $overwrite whether to signal that all data in prior
+     * generations associated with keys that are being inserted should be
+     * ignored (for instance, multi-word search are partially computed and
+     * added to the index. If these get recomputed we might want to ignore
+     * prior work. )
      */
     public function addPartitionWordData($partition,
         &$word_data, $overwrite = false)
@@ -733,10 +887,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Adds the provided $word_key to the BloomFilter for the given partition
      *
-     * @param int $partition
-     * @param string $word_key
-     * @return bool
+     * @param int $partition whose Bloom Filter we want to add the word_key to
+     * @param string $word_key the key to add
+     * @return bool whether the add was successful
      */
     public function addPartitionIndexFilter($partition, $word_key)
     {
@@ -752,9 +907,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
-     *
-     * @param int $partition
-     * @return bool
+     * Initializes or constructs the Bloom filter associated with a partition
+     * @param int $partition index of desired partition
+     * @return bool whether the operation was successful
      */
     public function initPartitionIndexFilter($partition)
     {
@@ -777,14 +932,18 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
-     *
-     * @param string $word_key
-     * @param int $Lint
-     * @param int $num
-     * @param array $restrict_phrases
-     * @param string $phrase_key
-     * @param array $phrase_info
-     * @return array
+     * Gets doc summaries of documents containing a given word and meeting the
+     * additional provided criteria
+     * @param string $word_key hash of the word whose document results
+     *      should be iterated over
+     * @param int $limit index of the first document to return
+     * @param int $num number of documents to return summaries of
+     * @param array $restrict_phrases additional words and phrases used to
+     *      further restrict the search
+     * @param string $phrase_key a hash of the word and restricted phrases to
+     *      store the results of the look up
+     * @param array $phrase_info info block of the word
+     * @return array document summaries
      */
     public function getSummariesByHash($word_key, $limit, $num,
         $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL)
@@ -831,10 +990,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Gets the page out of the summaries WebArchiveBundle with the given
+     * key and offset
      *
-     * @param string $key
-     * @param int $offset
-     * @return array
+     * The $key determines the partition WebArchive, the $offset give the
+     * byte offset within that archive.
+     * @param string $key hash to use to look up WebArchive partition
+     * @param int $offset byte offset in partition of desired page
+     * @return array desired page
      */
     public function getPage($key, $offset)
     {
@@ -842,11 +1005,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Returns a block of documents a word occurs in. The doc block looked up
+     * is at a given offset into the word's partition WebArchive for a given
+     * generation. This is used when the word occurs more than
+     * COMMON_WORD_THRESHOLD many times in a generation
      *
-     * @param string $word_key
-     * @param int $offset
-     * @param int $generation
-     * @return array
+     * @param string $word_key hash of word whose doc block we are looking up
+     * @param int $offset byte offset into word's partition WebArchive for the
+     *      supplied generation
+     * @param int $generation which generation to look up the doc block of
+     * @return array the desired doc block
      */
     public function getWordDocBlock($word_key, $offset, $generation = -1)
     {
@@ -860,11 +1028,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Gets a page in WebArchive $partition of the word index
+     * using the provided byte $offset and using existing $file_handle
+     * if possible.
      *
-     * @param int $partition
-     * @param int $offset
-     * @param resource $file_handle
-     * @return array
+     * @param int $partition which WebArchive to look in
+     * @param int $offset byte offset of page data
+     * @param resource $file_handle file handle resource of $partition archive
+     * @return array desired page
      */
     public function getPageByPartition($partition, $offset, $file_handle = NULL)
     {
@@ -873,9 +1044,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Adds the given summary to the summary exists filter bundle
      *
-     * @param string $key_field
-     * @param array $page
+     * @param string $key_field field of page with hash of page content
+     * @param array $page summary of page
      */
     public function addPageFilter($key_field, $page)
     {
@@ -883,7 +1055,13 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Looks at the $field_name key of elements of $page_array and computes
+     * an array consisting of $field_name values which are not in
+     * the page_exists_filter_bundle of the summaries bundle
+     *
+     * @param array &$page_array set of page data to start from
+     * @param string $field_name field to check against filter bundle
+     * @return mixed false if filter empty; desired array otherwise
      */
     public function differenceContainsPages(&$page_array, $field_name = NULL)
     {
@@ -892,13 +1070,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
-     *
+     * Forces the data in the page exists filter bundle of summaries
+     * to be saved to disk; forces each index partition filter to be saved
      */
     public function forceSave()
     {
         $this->summaries->forceSave();
         for($i = 0; $i < $this->num_partitions_index; $i++) {
-            if($this->index_partition_filters[$i] &&
+            if(isset($this->index_partition_filters[$i]) &&
                 $this->index_partition_filters[$i] != NULL) {
                 $this->index_partition_filters[$i]->save();
             }
@@ -906,11 +1085,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Computes statistics for the provided phrase_key.
+     * These include an estimate of the total number of documents it occurs in,
+     * as well as which generations it occurs in, and what its info block
+     * looks like in the current generation
      *
-     * @param string $phrase_key
-     * @param int $generation_index
-     * @param array $info_block
-     * @return array
+     * @param string $phrase_key what to compute statistics for
+     * @param int $generation_index the current generation
+     * @param array $info_block info_block of the phrase_key (will look up
+     *      if not provided)
+     * @return array info for this $phrase_key
      */
     public function getPhraseIndexInfo(
         $phrase_key, $generation_index = 0, $info_block = NULL)
@@ -986,6 +1170,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Sets the information associated with a word in the inverted index
      *
      * @param string $phrase_key
      * @param array $info
@@ -1008,11 +1193,17 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Adds the supplied phrase to the IndexArchiveBundle.
      *
-     * @param string $word_key
-     * @param array $restrict_phrases
-     * @param string $phrase_key
-     * @param $num_needed
+     * The most selective word in the phrase is $word_key, the additional
+     * words are in $restrict_phrases, the hash of the phrase to add is
+     * $phrase_key, and if there will be a lot of results compute at least
+     * the first $num_needed.
+     *
+     * @param string $word_key hash of most selective word in phrase
+     * @param array $restrict_phrases additional words in phrase
+     * @param string $phrase_key hash of phrase to add
+     * @param int $num_needed minimum number of doc results to save if possible
      */
     public function addPhraseIndex($word_key, $restrict_phrases,
         $phrase_key, $num_needed)
@@ -1082,11 +1273,12 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Computes the words which appear in the fewest or most documents
      *
-     * @param array $word_keys
-     * @param int $num
-     * @param string $comparison
-     * @return array
+     * @param array $word_keys keys of words to select amongst
+     * @param int $num number of words from the above set to return
+     * @param string $comparison callback function name for how to compare words
+     * @return array the $num words occurring in the most or fewest documents
      */
     public function getSelectiveWords($word_keys, $num, $comparison="lessThan")
         //lessThan is in utility.php
@@ -1109,10 +1301,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Reads the info block of $partition index WebArchive
      *
-     * @param int $partition
-     * @param int $generation
-     * @return array
+     * @param int $partition WebArchive to read from
+     * @return array data in its info block
      */
     public function readPartitionInfoBlock($partition, $generation = -1)
     {
@@ -1127,9 +1319,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
+     * Write $data into the info block of the $partition index WebArchive
      *
-     * @param int $partition
-     * @param array $data
+     * @param int $partition WebArchive to write into
+     * @param array $data what to write
      */
     public function writePartitionInfoBlock($partition, $data)
     {
@@ -1137,7 +1330,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
     }

     /**
-     * Gets teh description, count of summaries, and number of partions of the
+     * Gets the description, count of summaries, and number of partions of the
      * summaries store in the supplied directory
      *
      * @param string path to a directory containing a summaries WebArchiveBundle
diff --git a/lib/priority_queue.php b/lib/priority_queue.php
index 763863c3d..d389a58b0 100755
--- a/lib/priority_queue.php
+++ b/lib/priority_queue.php
@@ -52,8 +52,9 @@ require_once "crawl_constants.php";

 /**
  *
- * Code used to manage a memory efficient priority queue
- * Weights for the queue must be flaots
+ * Code used to manage a memory efficient priority queue.
+ * Weights for the queue must be floats. The queue itself is
+ * implemented using heaps
  *
  * @author Chris Pollett
  *
@@ -97,7 +98,8 @@ class PriorityQueue extends StringArray implements CrawlConstants
     var $notifier; // who to call if move an item in queue

     /**
-     * Makes a priority queue with the given operating parameters
+     * Makes a priority queue (implemented as an array heap) with the given
+     * operating parameters
      *
      * @param string $fname filename to store the data associated with the queue
      * @param int $num_values number of values the queue can hold
@@ -129,7 +131,7 @@ class PriorityQueue extends StringArray implements CrawlConstants
      * Gets the data stored at the ith location in the priority queue
      *
      * @param int $i location to return data from
-     * @return mixed data if the value of $i is between 1 and count, false
+     * @return mixed array data if the value of $i is between 1 and count, false
      *      otherwise
      */
     function peek($i = 1)
@@ -142,9 +144,16 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Removes and returns the ith element out of the Priority queue.
+     * Since this is a priority queue the first element in the queue
+     * will either be the min or max (depending on queue type) element
+     * stored. If $i is not in range an error message is written to the log.
+     * This operation also performs a check to see if the queue should be
+     * saved to disk
      *
-     * @param int $i
-     * @return mixed
+     * @param int $i element to get out of the queue
+     * @return mixed array data if the value of $i is between 1 and count, false
+     *      otherwise
      */
     function poll($i = 1)
     {
@@ -165,9 +174,12 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
-     * @param string $data
-     * @param float $weight
-     * @return mixed
+     * Inserts a new item into the priority queue.
+     *
+     * @param string $data what to insert into the queue
+     * @param float $weight how much the new data should be weighted
+     * @return mixed index location in queue where item was stored if
+     *      successful, otherwise false.
      */
     function insert($data, $weight)
     {
@@ -185,9 +197,11 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Add $delta to the $ith element in the priority queue and then adjusts
+     * the queue to restore the heap property
      *
-     * @param int $i
-     * @param float $delta
+     * @param int $i element whose weight should be adjusted
+     * @param float $delta how much to change the weight by
      */
     function adjustWeight($i, $delta)
     {
@@ -217,7 +231,8 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
-     *
+     * Pretty prints the contents of the queue viewed as an array.
+     *
      */
     function printContents()
     {
@@ -228,8 +243,10 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Return the contents of the priority queue as an array of
+     * value weight pairs.
      *
-     * @return array
+     * @return array contents of the queue
      */
     function getContents()
     {
@@ -242,8 +259,14 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Scales the weights of elements in the queue so that the sum of the new
+     * weights is $new_total
      *
-     * @param int $new_total
+     * This function is used periodically to prevent the queue from being
+     * gummed up because all of the weights stored in it are too small.
+     *
+     * @param int $new_total what the new sum of weights of elements in the
+     *      queue will be after normalization
      */
     function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
@@ -267,9 +290,13 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * If the $ith element in the PriorityQueue violates the heap
+     * property with its parent node (children should be of lower
+     * priority than the parent), this function
+     * tries to modify the heap to restore the heap property.
      *
-     * @param int $i
-     * @return int
+     * @param int $i node to consider in restoring the heap property
+     * @return int final position $ith node ends up at
      */
     function percolateUp($i)
     {
@@ -297,9 +324,12 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * If the ith element in the PriorityQueue violates the heap
+     * property with some child node (children should be of lower
+     * priority than the parent), this function
+     * tries to modify the heap to restore the heap property.
      *
-     * @param int $i
-     * @return int
+     * @param int $i node to consider in restoring the heap property
      */
     function percolateDown($i)
     {
@@ -339,10 +369,14 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Computes the difference of the two values $value1 and $value2
+     *
+     * Which is subtracted from which is determined by whether this is
+     * a min_or_max priority queue
      *
-     * @param float $value1
-     * @param float $value2
-     * @return float
+     * @param float $value1 a value to take the difference between
+     * @param float $value2 the other value
+     * @return float the difference
      */
     function compare($value1, $value2)
     {
@@ -354,10 +388,11 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Gets the ith element of the PriorityQueue viewed as an array
      *
-     *
-     * @param int $i
-     * @return array
+     * @param int $i element to get
+     * @return array value stored in queue together with its weight as a two
+     *      element array
      */
     function getRow($i)
     {
diff --git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php
index b8e6fb415..fa1c61b5d 100755
--- a/lib/processors/pdf_processor.php
+++ b/lib/processors/pdf_processor.php
@@ -144,9 +144,11 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Checks if the PDF object's object dictionary is in a list of types
      *
-     * @param string $object_dictionary
-     * @param array $type_array
+     * @param string $object_dictionary the object dictionary to check
+     * @param array $type_array the list of types to check against
+     * @return bool whether the object dictionary is in $type_array or not
      */
     static function objectDictionaryHas($object_dictionary, $type_array)
     {
@@ -160,9 +162,9 @@ class PdfProcessor extends TextProcessor
     }

     /**
-     *
-     * @param string $object_string
-     * @return string
+     * Gets the object dictionary portion of the current PDF object
+     * @param string $object_string represents the contents of a PDF object
+     * @return string the object dictionary for the object
      */
     static function getObjectDictionary($object_string)
     {
@@ -172,9 +174,10 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Gets the object stream portion of the current PDF object
      *
-     * @param string $object_stream
-     * @return string
+     * @param string $object_stream represents the contents of a PDF object
+     * @return string the object stream for the object
      */
     static function getObjectStream($object_string)
     {
@@ -185,9 +188,12 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Extracts ASCII text from PDF data, getting rid of non printable data,
+     * square brackets and parenthesis and converting char codes to their
+     * values.
      *
-     * @param string $data
-     * @return string
+     * @param string $data source to extract character data from
+     * @return string extracted text
      */
     static function parseText($data)
     {
@@ -227,10 +233,12 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Extracts ASCII text till the next close brackets
      *
-     * @param string $data
-     * @param int $cur_pos
-     * @return array
+     * @param string $data source to extract character data from
+     * @param int $cur_pos position to start in $data
+     * @return array pair consisting of the final position in $data as well
+     *      as extracted text
      */
     static function parseBrackets($data, $cur_pos)
     {
@@ -268,10 +276,12 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Extracts ASCII text till the next close parenthesis
      *
-     * @param string $data
-     * @param int $cur_pos
-     * @return array
+     * @param string $data source to extract character data from
+     * @param int $cur_pos position to start in $data
+     * @return array pair consisting of the final position in $data as well
+     *      as extracted text
      */
     static function parseParentheses($data, $cur_pos)
     {
@@ -302,7 +312,7 @@ class PdfProcessor extends TextProcessor
         }

         $check_positioning = substr($data, $cur_pos, 4);
-        if(preg_match("/\-\d{3}/", $check_positioning) >0 ) {
+        if(preg_match("/\-\d{3}/", $check_positioning) > 0 ) {
             $out .= " ";
         }

diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php
index 96994dfb7..3430d588b 100755
--- a/lib/processors/rtf_processor.php
+++ b/lib/processors/rtf_processor.php
@@ -50,9 +50,15 @@ class RtfProcessor extends TextProcessor
 {

     /**
-     * @param string $page
-     * @param string $url
-     * @return array
+     * Computes a summary based on a rtf string of a document
+     *
+     * @param string $page rtf string of a document
+     * @param string $url location the document came from, not used by
+     *      RTFProcessor at this point. Some of its subclasses override
+     *      this method and use url to produce complete links for
+     *      relative links within a document
+     * @return array a summary of (title, description, links, and content) of
+     *      the information in $page
      */
     public static function process($page, $url)
     {
@@ -72,9 +78,13 @@ class RtfProcessor extends TextProcessor
     }

     /**
+     * Gets plain text out of an rtf string
+     *
+     * Plain text is mainly extracted by getText(), this function does
+     * some pre- and post-processing of escaped braces
      *
-     * @param string $rtf_string
-     * @return string
+     * @param string $rtf_string what to extract plain text out of
+     * @return string plain text
      */
     static function extractText($rtf_string) {
         $rtf_string = preg_replace('/\\\{/',"!ZZBL!", $rtf_string);
@@ -92,9 +102,10 @@ class RtfProcessor extends TextProcessor
     }

     /**
+     * Gets plain text out of an rtf string
      *
-     * @param string $rtf_string
-     * @return string
+     * @param string $rtf_string what to extract plain text out of
+     * @return string plain text
      */
     static function getText($rtf_string)
     {
@@ -130,10 +141,11 @@ class RtfProcessor extends TextProcessor
     }

     /**
+     * Gets the contents of the rtf group at the current position in the string
      *
-     * @param string $rtf_string
-     * @param int $cur_pos
-     * @return string
+     * @param string $rtf_string data to get rtf group from
+     * @param int $cur_pos position in $rtf_string at which to get the group
+     * @return string contents of the rtf group
      */
     static function getNextObject($rtf_string, $cur_pos)
     {
diff --git a/lib/web_archive.php b/lib/web_archive.php
index 85ffea504..1bb1a2dd7 100755
--- a/lib/web_archive.php
+++ b/lib/web_archive.php
@@ -51,39 +51,36 @@ class WebArchive
 {

     /**
-     *
+     * Filename used to store the web archive.
+     * @var string
      */
     var $filename;
     /**
      *
+     * Current offset into the web archive the iterator for the archive is at
+     * (at most one iterator / archive -- oh well)
+     * @var int
      */
     var $iterator_pos;
     /**
-     *
+     * Filter object used to compress/uncompress objects stored in archive
+     * @var object
      */
     var $compressor;
     /**
-     *
+     * Number of items in the archive
+     * @var int
      */
     var $count;

     /**
+     * Makes or initializes a WebArchive object using the supplied parameters
      *
-     */
-    const OPEN_AND_CLOSE = 1;
-    /**
-     *
-     */
-    const OPEN = 2;
-    /**
-     *
-     */
-    const CLOSE = 3;
-    /**
-     *
-     * @param string $fname
-     * @param string $compressor
-     * @param bool $fast_construct
+     * @param string $fname filename to use to store archive to disk
+     * @param string $compressor what kind of Compressor object should be
+     *      used to read and write objects in the archive
+     * @param bool $fast_construct do we read the info block of the web
+     *      archive as part of the constructing process
      */
     function __construct($fname, $compressor, $fast_construct = false)
     {
@@ -104,8 +101,11 @@ class WebArchive
     }

     /**
-     *
-     * @return array
+     * Read the info block associated with this web archive.
+     * The info block is meta data for the archive stored at the end of
+     * the WebArchive file. The particular meta is up to who is using
+     * the web archive.
+     * @return array the contents of the info block
      */
     function readInfoBlock()
     {
@@ -123,9 +123,15 @@ class WebArchive
     }

     /**
+     * Serializes and applies the compressor to an info block and write it at
+     * the end of the web archive
+     * The info block is meta data for the archive stored at the end of
+     * the WebArchive file. The particular meta is up to who is using
+     * the web archive.
      *
-     * @param resource $fh
-     * @param array &$data
+     * @param resource $fh resource for the web archive file. If null
+     *      the web archive is opened first and closed when the data is written
+     * @param array &$data data to write into the info block of the archive
      */
     function writeInfoBlock($fh = NULL, &$data = NULL)
     {
@@ -153,9 +159,12 @@ class WebArchive
     }

     /**
+     * Seeks in the WebArchive file to the end of the last Object.
+     *
+     * The last 4 bytes of a WebArchive say the length of an info block in bytes
      *
-     * @param resource $fh
-     * @return int
+     * @param resource $fh resource for the WebArchive file
+     * @return int length of the info block in bytes
      */
     function seekEndObjects($fh)
     {
@@ -168,13 +177,20 @@ class WebArchive
     }

     /**
+     * Adds objects to the WebArchive
      *
-     * @param string $offset_field
-     * @param array &$objects
-     * @param array $data
-     * @param string $callback
-     * @param bool $return_flag
-     * @return mixed
+     * @param string $offset_field field in objects to return the byte offset
+     *      at which they were stored
+     * @param array &$objects references to objects that will be stored
+     *      the offset field in these references will be adjusted if
+     * @param array $data data to write in the WebArchive's info block
+     * @param string $callback name of a callback
+     *      $callback($data, $new_objects, $offset_field)
+     *      used to modify $data before it is written
+     *      to the info block. For instance, we can add offset info to data.
+     * @param bool $return_flag if true rather than adjust the offsets by
+     *      reference, create copy objects and adjust their offsets and return
+     * @return mixed adjusted objects or void
      */
     function addObjects($offset_field, &$objects,
         $data = NULL, $callback = NULL, $return_flag = true)
@@ -224,9 +240,10 @@ class WebArchive
     }

     /**
+     * Open the web archive file associated with this WebArchive object.
      *
-     * @param string $mode
-     * @return resource
+     * @param string $mode read/write mode to open file with
+     * @return resource a file resource for the web archive
      */
     function open($mode = "r")
     {
@@ -243,12 +260,17 @@ class WebArchive
     }

     /**
+     * Gets $num many objects out of the web archive starting at byte $offset
+     *
+     * If the $next_flag is true the archive iterator is advance and if $fh
+     * is not NULL then it is assumed to be an open resource pointing to the
+     * archive (saving the time to open it).
      *
-     * @param int $offset
-     * @param int $num
-     * @param bool $next_flag
-     * @param resource $fh
-     * @return array
+     * @param int $offset a valid byte offset into a web archive
+     * @param int $num number of objects to return
+     * @param bool $next_flag whether to advance the archive iterator
+     * @param resource $fh either NULL or a file resource to the archive
+     * @return array the $num objects beginning at $offset
      */
     function getObjects($offset, $num, $next_flag = true, $fh = NULL)
     {
diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php
index 9f77ca1e8..4f56f6b77 100755
--- a/lib/web_archive_bundle.php
+++ b/lib/web_archive_bundle.php
@@ -34,20 +34,20 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- * A WebArchiveBundle is a collection of WebArchive, so load definition of
+ * A WebArchiveBundle is a collection of WebArchive's, so load definition of
  * web archive
  */
 require_once 'web_archive.php';
 /**
- *
+ * Bloom Filter used by BloomFilterBundle
  */
 require_once 'bloom_filter_file.php';
 /**
- *
+ * Used to check if a page already stored in the WebArchiveBundle
  */
 require_once 'bloom_filter_bundle.php';
 /**
- *
+ * Used to compress data stored in WebArchiveBundle
  */
 require_once 'gzip_compressor.php';

@@ -58,7 +58,7 @@ require_once 'gzip_compressor.php';
  * together.It is useful to split data across several archive files rather than
  * just store it in one, for both read efficiency and to keep filesizes from
  * getting too big. In some places we are using 4 byte int's to store file
- * offset which restricts the size of the files we can use for wbe archives.
+ * offsets which restricts the size of the files we can use for web archives.
  *
  * @author Chris Pollett
  *
@@ -69,45 +69,61 @@ class WebArchiveBundle
 {

     /**
-     *
+     * Folder name to use for this WebArchiveBundle
+     * @var string
      */
     var $dir_name;
     /**
-     *
+     * Maximum allowed insert into each BloomFilterFile in the
+     * page_exists_filter_bundle
+     * @var int
      */
     var $filter_size;
     /**
-     *
+     * Used to contain the WebArchive partitions of the bundle
+     * @var array
      */
     var $partition = array();
     /**
-     *
+     * BloomFilterBundle used to keep track of which pages are already in
+     * WebArchiveBundle
+     * @var object
      */
     var $page_exists_filter_bundle;
     /**
-     *
+     * Number of WebArchives in the WebArchiveBundle
+     * @var int
      */
     var $num_partitions;
     /**
-     *
+     * Total number of page objects stored by this WebArchiveBundle
+     * @var int
      */
     var $count;
     /**
-     *
+     * A short text name for this WebArchiveBundle
+     * @var string
      */
     var $description;
     /**
-     *
+     * The Compressor object used to compress/uncompress data stored in
+     * the bundle
+     * @var object
      */
     var $compressor;

     /**
+     * Makes or initializes an existing WebArchiveBundle with the given
+     * characteristics
      *
-     * @param string $dir_name
-     * @param int $filter_size
-     * @param int $num_partitions
-     * @param string $description
-     * @param string $compressor
+     * @param string $dir_name folder name of the bundle
+     * @param int $filter_size number of items that can be stored in
+     *      a given BloomFilterFile in the $page_exists_filter_bundle
+     * @param int $num_partitions number of WebArchive's in this bundle
+     * @param string $description a short text name/description of this
+     *      WebArchiveBundle
+     * @param string $compressor the Compressor object used to
+     *      compress/uncompress data stored in the bundle
      */
     function __construct($dir_name, $filter_size = -1,
         $num_partitions = NULL, $description = NULL,
@@ -176,11 +192,14 @@ class WebArchiveBundle
     }

     /**
+     * Add the array of $pages to the WebArchiveBundle pages being stored in
+     * the partition according to the $key_field and the field used to store
+     * the resulting offsets given by $offset_field.
      *
-     * @param string $key_field
-     * @param string $offset_field
-     * @param array &$pages
-     * @return array
+     * @param string $key_field field used to select partition
+     * @param string $offset_field field used to record offsets after storing
+     * @param array &$pages data to store
+     * @return array $pages adjusted with offset field
      */
     function addPages($key_field, $offset_field, &$pages)
     {
@@ -222,10 +241,13 @@ class WebArchiveBundle
     }

     /**
+     * Gets the page out of the WebArchiveBundle with the given key and offset
      *
-     * @param string $key
-     * @param int $offset
-     * @return array
+     * The $key determines the partition WebArchive, the $offset gives the
+     * byte offset within that archive.
+     * @param string $key hash to use to look up WebArchive partition
+     * @param int $offset byte offset in partition of desired page
+     * @return array desired page
      */
     function getPage($key, $offset)
     {
@@ -236,11 +258,13 @@ class WebArchiveBundle
     }

     /**
+     * Gets a page from WebArchive $partition using the provided byte
+     * $offset and using existing $file_handle if possible.
      *
-     * @param int $partition
-     * @param int $offset
-     * @param resource $file_handle
-     * @return array
+     * @param int $partition which WebArchive to look in
+     * @param int $offset byte offset of page data
+     * @param resource $file_handle file handle resource of $partition archive
+     * @return array desired page
      */
     function getPageByPartition($partition, $offset, $file_handle = NULL)
     {
@@ -256,10 +280,11 @@ class WebArchiveBundle
     }

     /**
+     * Adds the given page to the page exists filter bundle
      *
-     * @param string $key_field
-     * @param array &$page
-     * @return bool
+     * @param string $key_field field of page with hash of page content
+     * @param array &$page contents/summary of page
+     * @return bool whether the add succeeded
      */
     function addPageFilter($key_field, &$page)
     {
@@ -272,14 +297,17 @@ class WebArchiveBundle
     }

     /**
+     * Adds a list of objects to a given WebArchive partition
      *
-     * @param string $offset_field
-     * @param int $partition
-     * @param array &$objects
-     * @param array $data
-     * @param string $callback
-     * @param bool $return_flag
-     * @return mixed
+     * @param string $offset_field field used to store offsets after the
+     *      addition
+     * @param int $partition WebArchive index to store data into
+     * @param array &$objects objects to store
+     * @param array $data info header data to write
+     * @param string $callback function name of function to call as each
+     *      object is stored. Can be used to save offset into $data
+     * @param bool $return_flag whether to return modified $objects or not
+     * @return mixed adjusted objects or void
      */
     function addObjectsPartition($offset_field, $partition,
         &$objects, $data = NULL, $callback = NULL, $return_flag = true)
@@ -292,9 +320,10 @@ class WebArchiveBundle
     }

     /**
+     * Reads the info block of $partition WebArchive
      *
-     * @param int $partition
-     * @return array
+     * @param int $partition WebArchive to read from
+     * @return array data in its info block
      */
     function readPartitionInfoBlock($partition)
     {
@@ -302,9 +331,10 @@ class WebArchiveBundle
     }

     /**
+     * Write $data into the info block of the $partition WebArchive
      *
-     * @param int $partition
-     * @param array $data
+     * @param int $partition WebArchive to write into
+     * @param array $data what to write
      */
     function writePartitionInfoBlock($partition, &$data)
     {
@@ -312,10 +342,13 @@ class WebArchiveBundle
     }

     /**
+     * Looks at the $key_field key of elements of pages and computes an array
+     * consisting of $key_field values which are not in
+     * the page_exists_filter_bundle
      *
-     * @param array $pages
-     * @param string $key_field
-     * @return mixed
+     * @param array $pages set of page data to start from
+     * @param string $key_field field to check against filter bundle
+     * @return mixed false if filter empty; desired array otherwise
      */
     function differencePageKeysFilter($pages, $key_field)
     {
@@ -334,9 +367,11 @@ class WebArchiveBundle
     }

     /**
+     * Looks at the field_name key of elements of page_array and removes any
+     * of these which are in the page_exists_filter_bundle
      *
-     * @param array &$page_array
-     * @param string $field_name
+     * @param array &$page_array array to remove elements from
+     * @param string $field_name field to check against filter bundle
      */
     function differencePagesFilter(&$page_array, $field_name = NULL)
     {
@@ -345,7 +380,7 @@ class WebArchiveBundle
     }

     /**
-     *
+     * Forces the data in the page exists filter bundle to be saved to disk
      */
     function forceSave()
     {
@@ -355,9 +390,14 @@ class WebArchiveBundle
     }

     /**
+     * Gets an object encapsulating the $index th WebArchive partition in
+     * this bundle.
      *
-     * @param int $index
-     * @param bool $fast_construct
+     * @param int $index the number of the partition within this bundle to
+     *      return
+     * @param bool $fast_construct should the constructor of the WebArchive
+     *      avoid reading in its info block.
+     * @return object the WebArchive file which was requested
      */
     function getPartition($index, $fast_construct = true)
     {
@@ -379,8 +419,10 @@ class WebArchiveBundle
     }

     /**
+     * Updates the description file with the current count for the number of
+     * items in the WebArchiveBundle
      *
-     * @param int $num
+     * @param int $num number of items to add to current count
      */
     function addCount($num)
     {
@@ -391,9 +433,14 @@ class WebArchiveBundle
     }

     /**
+     * Gets information about a WebArchiveBundle out of its description.txt
+     * file
      *
-     * @param string $dir_name
-     * @return array
+     * @param string $dir_name folder name of the WebArchiveBundle to get info
+     *  for
+     * @return array containing the name (description) of the WebArchiveBundle,
+     *      the number of items stored in it, and the number of WebArchive
+     *      file partitions it uses.
      */
     static function getArchiveInfo($dir_name)
     {
@@ -413,10 +460,12 @@ class WebArchiveBundle
     }

     /**
+     * Hashes $value to a WebArchive partition it should be read/written to,
+     * if a bundle has $num_partitions partitions.
      *
-     * @param string $value
-     * @param int $num_partitions
-     * @return int
+     * @param string $value item to hash
+     * @param int $num_partitions number of partitions
+     * @return int which partition $value should be written to/read from
      */
     static function selectPartition($value, $num_partitions)
     {
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index 20eb54ef3..efb06bd13 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -64,8 +64,14 @@ require_once 'web_archive.php';
 require_once 'utility.php';

 /**
- * Encapsulates the data structures needed to have a queue of urls to crawl
- * next
+ * Encapsulates the data structures needed to have a queue of to crawl urls
+ *
+ * <pre>
+ * (hash of url, weights) are stored in a PriorityQueue,
+ * (hash of url, index in PriorityQueue, offset of url in WebArchive) is stored
+ * in a HashTable
+ * urls are stored in a WebArchive in an uncompressed format
+ * </pre>
  *
  * @author Chris Pollett
  *
@@ -81,74 +87,93 @@ class WebQueueBundle implements Notifier
      */
     var $dir_name;
     /**
-     *
+     * Number of items that can be stored in a partition of the page exists
+     * filter bundle
      * @var int
      */
     var $filter_size;
     /**
-     *
+     * number of entries the priority queue used by this web queue bundle
+     * can store
      * @var int
      */
     var $num_urls_ram;
     /**
-     *
+     * whether polling the first element of the priority queue returns the
+     * smallest or largest weighted element. This is set to a constant specified
+     * in PriorityQueue
      * @var int
      */
     var $min_or_max;
     /**
-     *
+     * the PriorityQueue used by this WebQueueBundle
      * @var object
      */
     var $to_crawl_queue;
     /**
-     *
+     * the HashTable used by this WebQueueBundle
      * @var object
      */
     var $to_crawl_table;
     /**
-     *
+     * Current count of the number of non-read operations performed on the
+     * WebQueueBundle's hash table since the last time it was rebuilt.
      * @var int
      */
     var $hash_rebuild_count;
     /**
-     *
+     * Number of non-read operations on the hash table before it needs to be
+     * rebuilt.
      * @var int
      */
     var $max_hash_ops_before_rebuild;
     /**
-     *
+     * WebArchive used to store urls that are to be crawled
      * @var object
      */
     var $to_crawl_archive;

+    /**
+     * BloomFilter used to keep track of which urls we've already seen
+     * @var object
+     */
     var $url_exists_filter_bundle;
     /**
-     *
+     * BloomFilter used to store the hosts whose robots.txt file we
+     * have already downloaded
      * @var object
      */
     var $got_robottxt_filter;
     /**
-     *
+     * BloomFilter used to store disallowed to-crawl host paths
      * @var object
      */
     var $dissallowed_robot_filter;
     /**
-     *
+     * BloomFilter used to keep track of crawl delay in seconds for a given
+     * host
      * @var object
      */
     var $crawl_delay_filter;

     /**
-     *
+     * The largest offset for the url WebArchive before we rebuild it.
+     * Entries are never deleted from the url WebArchive even if they are
+     * deleted from the priority queue. So when we pass this value we
+     * make a new WebArchive containing only those urls which are still in
+     * the queue.
      */
     const max_url_archive_offset = 1000000000;

     /**
+     * Makes a WebQueueBundle with the provided parameters
      *
-     * @param string $dir_name
-     * @param int $filter_size
-     * @param int $num_urls_ram
-     * @param string $min_or_max
+     * @param string $dir_name folder name used by this WebQueueBundle
+     * @param int $filter_size size of each partition in the page exists
+     *      BloomFilterBundle
+     * @param int $num_urls_ram number of entries in ram for the priority queue
+     * @param string $min_or_max whether the priority queue maintains the heap
+     *      property with respect to the least or the largest weight
      */
     function __construct($dir_name,
         $filter_size, $num_urls_ram, $min_or_max)
@@ -234,8 +259,9 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Adds an array of (url, weight) pairs to the WebQueueBundle.
      *
-     * @param array $url_paris
+     * @param array $url_pairs a list of pairs to add
      */
     function addUrlsQueue(&$url_pairs)
     {
@@ -277,8 +303,9 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $url
+     * Checks whether the url queue already contains the given url
+     * @param string $url what to check
+     * @return bool whether it is contained in the queue yet or not
      */
     function containsUrlQueue(&$url)
     {
@@ -288,9 +315,16 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Adjusts the weight of the given url in the priority queue by amount delta
      *
-     * @param string $url
-     * @param float $delta
+     * In a page importance crawl, a given web page casts its votes on who
+     * to crawl next by splitting its crawl money amongst its child links.
+     * This entails that a mechanism for adjusting weights of elements in the
+     * priority queue periodically is necessary. This function is used to
+     * solve this problem.
+     *
+     * @param string $url url whose weight in queue we want to adjust
+     * @param float $delta change in weight (usually positive).
      */
     function adjustQueueWeight(&$url, $delta)
     {
@@ -308,8 +342,13 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Removes a url from the priority queue.
+     *
+     * This method would typically be called during a crawl after the given
+     * url is scheduled to be crawled. It only deletes the item from
+     * the bundles priority queue and hash table -- not from the web archive.
      *
-     * @param string $url
+     * @param string $url the url to delete
      */
     function removeQueue($url)
     {
@@ -331,10 +370,10 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param int $i
-     * @param resource $fh
-     * @return mixed
+     * Gets the url and weight of the ith entry in the priority queue
+     * @param int $i entry to look up
+     * @param resource $fh a file handle to the WebArchive for urls
+     * @return mixed false on error, otherwise the ordered pair in an array
      */
     function peekQueue($i = 1, $fh = NULL)
     {
@@ -365,7 +404,7 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
+     * Pretty prints the contents of the queue bundle in order
      */
     function printContents()
     {
@@ -378,8 +417,9 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @return array
+     * Gets the contents of the queue bundle as an array of ordered url,weight
+     * pairs
+     * @return array a list of ordered url, weight pairs
      */
     function getContents()
     {
@@ -392,8 +432,9 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param int $new_total
+     * Makes the weight sum of the to-crawl priority queue sum to $new_total
+     * @param int $new_total amount weights should sum to. All weights will be
+     *      scaled by the same factor.
      */
     function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
@@ -403,9 +444,10 @@ class WebQueueBundle implements Notifier
     //Filter and Filter Bundle Methods

     /**
-     *
-     * @param string $mode
-     * @return resource
+     * Opens the url WebArchive associated with this queue bundle in the
+     * given read/write mode
+     * @param string $mode the read/write mode to open the archive with
+     * @return resource a file handle to the WebArchive file
      */
     function openUrlArchive($mode = "r")
     {
@@ -413,8 +455,8 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param resource $fh
+     * Closes a file handle to the url WebArchive
+     * @param resource $fh a valid handle to the url WebArchive file
      */
     function closeUrlArchive($fh)
     {
@@ -422,8 +464,8 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $url
+     * Adds the supplied url to the url_exists_filter_bundle
+     * @param string $url url to add
      */
     function addSeenUrlFilter($url)
     {
@@ -431,9 +473,10 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param array &$url_array
-     * @param string $field_name
+     * Removes all url objects from $url_array which have been seen
+     * @param array &$url_array objects to check if have been seen
+     * @param string $field_name component of a url_array element which
+     *      contains a url to check if seen
      */
     function differenceSeenUrls(&$url_array, $field_name = NULL)
     {
@@ -442,8 +485,8 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $host
+     * Adds the supplied $host to the got_robottxt_filter
+     * @param string $host host to add
      */
     function addGotRobotTxtFilter($host)
     {
@@ -451,9 +494,9 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $host
-     * @return bool
+     * Checks if we have a fresh copy of robots.txt info for $host
+     * @param string $host host to check
+     * @return bool whether we do or not
      */
     function containsGotRobotTxt($host)
     {
@@ -461,18 +504,20 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $host
+     * Adds a new entry to the disallowed robot host path Bloom filter
+     * @param string $host_path the path on host that is excluded. For example
+     * http://somewhere.com/bob disallows bob on somewhere.com
      */
-    function addDisallowedRobotFilter($host)
+    function addDisallowedRobotFilter($host_path)
     {
-        $this->dissallowed_robot_filter->add($host);
+        $this->dissallowed_robot_filter->add($host_path);
     }

     /**
-     *
-     * @param string $host_path
-     * @return bool
+     * Checks if the given $host_path is disallowed by the host's
+     * robots.txt info.
+     * @param string $host_path host path to check
+     * @return bool whether it was disallowed or not
      */
     function containsDisallowedRobot($host_path)
     {
@@ -480,8 +525,9 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @return int
+     * Gets the timestamp of the oldest robot data still stored in
+     * the queue bundle
+     * @return int a Unix timestamp
      */
     function getRobotTxtAge()
     {
@@ -493,9 +539,10 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Sets the Crawl-delay of $host to the passed $value in seconds
      *
-     * @param string $host
-     * @param int $value
+     * @param string $host a host to set the Crawl-delay for
+     * @param int $value a delay in seconds up to 255
      */
     function setCrawlDelay($host, $value)
     {
@@ -511,9 +558,10 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Gets the Crawl-delay of $host from the crawl delay bloom filter
      *
-     * @param string $host
-     * @return int
+     * @param string $host site to check for a Crawl-delay
+     * @return int the crawl-delay in seconds or -1 if $host has no delay
      */
     function getCrawlDelay($host)
     {
@@ -532,10 +580,15 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Mainly, a Factory style wrapper around the HashTable's constructor.
+     * However, this function also sets up a rebuild frequency. It is used
+     * as part of the process of keeping the to crawl table from having too
+     * many entries
      *
-     * @param string $name
-     * @param int $num_values
-     * @return object
+     * @param string $name filename to store the hash table persistently
+     * @param int $num_values size of HashTable's array
+     * @return object the newly built hash table
+     * @see rebuildHashTable()
      */
     function constructHashTable($name, $num_values)
     {
@@ -545,9 +598,11 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Looks up $key in the to-crawl hash table
      *
-     * @param string $key
-     * @return string
+     * @param string $key the things to look up
+     * @return mixed would be string if the value is being returned,
+     *      otherwise, false if the key is not found
      */
     function lookupHashTable($key)
     {
@@ -555,12 +610,12 @@ class WebQueueBundle implements Notifier
     }

     /**
-     *
-     * @param string $value
+     * Removes an entry from the to-crawl hash table
+     * @param string $key usually a hash of a url
      */
-    function deleteHashTable($value)
+    function deleteHashTable($key)
     {
-        $this->to_crawl_table->delete($value);
+        $this->to_crawl_table->delete($key);
         $this->hash_rebuild_count++;
         if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) {
             $this->rebuildHashTable();
@@ -568,10 +623,12 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Inserts the $key, $value pair into this web queue's to crawl table
      *
-     * @param string $key
-     * @param string $value
-     * @return bool
+     * @param string $key intended to be a hash of a url
+     * @param string $value intended to be offset into a webarchive for urls
+     *      together with an index into the priority queue
+     * @return bool whether the insert was a success or not
      */
     function insertHashTable($key, $value)
     {
@@ -583,7 +640,21 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Makes a new HashTable without deleted rows
      *
+     * The hash table in Yioop is implemented using open addressing. i.e.,
+     * We store key value pair in the table itself and if there is a collision
+     * we look for the next available slot. Two codes are use to indicate
+     * space available in the table. One to indicate empty never used, the
+     * other used to indicate empty but previously used and deleted. The reason
+     * you need two codes is to ensure that if somebody inserted an item B
+     * that hashes to the same value as A, and we moved to the next empty slot
+     * to store B, then if we delete A we should still be able to look up B.
+     * The problem is as the table gets reused a lot, it tends to fill up
+     * with a lot of deleted entries making lookup times get more and more
+     * linear in the hash table size. By rebuilding the table we mitigate
+     * against this problem. By choosing the rebuild frequency appropriately,
+     * the amortized cost of this operation is only O(1).
      */
     function rebuildHashTable()
     {
@@ -688,9 +759,13 @@ class WebQueueBundle implements Notifier
     }

     /**
+     * Callback which is called when an item in the priority queue changes
+     * position. The position is updated in the hash table.
+     * The priority queue stores (hash of url, weight). The hash table
+     * stores (hash of url, web_archive offset to url, index in priority queue).
      *
-     * @param int $index
-     * @param array $data
+     * @param int $index new index in priority queue
+     * @param array $data (hash url, weight)
      */
     function notify($index, $data)
     {
ViewGit