Yet more documentation

Chris Pollett [2010-07-30 18:Jul:th]
Yet more documentation
Filename
bin/queue_server.php
controllers/fetch_controller.php
lib/hash_table.php
lib/index_archive_bundle.php
lib/priority_queue.php
lib/processors/doc_processor.php
lib/processors/html_processor.php
lib/processors/pdf_processor.php
lib/processors/ppt_processor.php
lib/processors/rtf_processor.php
lib/processors/text_processor.php
lib/web_archive.php
lib/web_archive_bundle.php
lib/web_queue_bundle.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 8ab704959..abfcb19a8 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -154,7 +154,10 @@ class QueueServer implements CrawlConstants
     var $most_recent_fetcher;

     /**
+     * Makes a queue_server object with the supplied indexed_file_types
      *
+     * As part of the creation process, a database manager is initialized so
+     * the queue_server can make use of its file/folder manipulation functions.
      */
     function __construct($indexed_file_types)
     {
@@ -247,6 +250,8 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param array $info
+     * @return array
      */
     function handleAdminMessages($info)
     {
@@ -317,6 +322,7 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param array $info
      */
     function startCrawl($info)
     {
@@ -394,6 +400,8 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param string $base_dir
+     * @param string $callback_method
      */
     function processDataFile($base_dir, $callback_method)
     {
@@ -441,6 +449,7 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param string $file
      */
     function processIndexArchive($file)
     {
@@ -552,6 +561,7 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param string $file
      */
     function processRobotArchive($file)
     {
@@ -608,6 +618,7 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @return array
      */
     function processQueueUrls()
     {
@@ -638,7 +649,8 @@ class QueueServer implements CrawlConstants
     }

     /**
-     *
+     * @param string $file
+     * @return array
      */
     function processDataArchive($file)
     {
@@ -782,6 +794,7 @@ class QueueServer implements CrawlConstants

     /**
      *
+     * @param array &$sites
      */
     function deleteSeenUrls(&$sites)
     {
@@ -790,7 +803,17 @@ class QueueServer implements CrawlConstants


     /**
+     * Produces a schedule.txt file of url data for a fetcher to crawl next.
+     *
+     * The hard part of scheduling is to make sure that the overall crawl
+     * process obeys robots.txt files. This involves checking the url is in
+     * an allowed path for that host and it also involves making sure the
+     * Crawl-delay directive is respected. The first fetcher that contacts the
+     * server requesting data to crawl will get the schedule.txt
+     * produced by produceFetchBatch() at which point it will be unlinked
+     * (these latter things are controlled in FetchController).
      *
+     * @see FetchController
      */
     function produceFetchBatch()
     {
@@ -985,7 +1008,15 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Gets the first unfilled schedule slot after $index in $arr
+     *
+     * A schedule of sites for a fetcher to crawl consists of MAX_FETCH_SIZE
+     * many slots each of which could eventually hold url information.
+     * This function is used to schedule slots for crawl-delayed hosts.
      *
+     * @param int $index location to begin searching for an empty slot
+     * @param array $arr list of slots to look in
+     * @return int index of first available slot
      */
     function getEarliestSlot($index, $arr)
     {
@@ -1002,7 +1033,11 @@ class QueueServer implements CrawlConstants


     /**
+     * Checks if url belongs to a list of sites that are allowed to be
+     * crawled
      *
+     * @param string $url url to check
+     * @return bool whether the url is allowed to be crawled or not
      */
     function allowedToCrawlSite($url)
     {
@@ -1018,7 +1053,11 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Checks if url belongs to a list of sites that aren't supposed to be
+     * crawled
      *
+     * @param string $url url to check
+     * @return bool whether the url shouldn't be crawled
      */
     function disallowedToCrawlSite($url)
     {
@@ -1026,7 +1065,14 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Checks if the url belongs to one of the sites listed in $site_array.
+     * Sites can be either given in the form domain:host or
+     * in the form of a url in which case it is checked that the site url
+     * is a substring of the passed url.
      *
+     * @param string $url url to check
+     * @param array $site_array sites to check against
+     * @return bool whether the url belongs to one of the sites
      */
     function urlMemberSiteArray($url, $site_array)
     {
@@ -1049,7 +1095,9 @@ class QueueServer implements CrawlConstants
     }

     /**
+     * Gets a list of all the timestamps of previously stored crawls
      *
+     * @return array list of timestamps
      */
     function getCrawlTimes()
     {
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index ae6143797..04adb8cd3 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -67,6 +67,9 @@ class FetchController extends Controller implements CrawlConstants
     var $activities = array("schedule", "update", "crawlTime");

     /**
+     * Checks that the request seems to be coming from a legitimate fetcher then
+     * determines which activity the fetcher is requesting and calls that
+     * activity for processing.
      *
      */
     function processRequest()
@@ -79,11 +82,12 @@ class FetchController extends Controller implements CrawlConstants
         if(!$this->checkRequest()) {return; }

         $activity = $_REQUEST['a'];
-        $this->$activity();
+        if(in_array($activity, $this->activities)) {$this->$activity();}
     }

     /**
-     *
+     * Checks if there is a schedule of sites to crawl available and
+     * if so presents it to the requesting fetcher, and then deletes it.
      */
     function schedule()
     {
@@ -106,7 +110,8 @@ class FetchController extends Controller implements CrawlConstants
     }

     /**
-     *
+     * Processes Robot, To Crawl, and Index data sent from a fetcher.
+     * Acknowledges to the fetcher if this data was received okay.
      */
     function update()
     {
@@ -144,10 +149,17 @@ class FetchController extends Controller implements CrawlConstants
     }

     /**
-     *  @param &array $sites
-     *  @param string $address
-     *  @param string $day
-     *  @param string $time
+     * Adds a file containing the seen sites and inverted index from the
+     * just received $sites array to the schedules folder's index directory's
+     * subfolder for the current crawl time. This file is added in a sub folder
+     * $day and its name contains the $time at which it arrived and the ip
+     * $address from which it arrived. This file will then be processed later
+     * by the queue server.
+     *
+     * @param &array $sites a list of seen sites and an inverted index
+     * @param string $address the IP address of the sending machine with . -->_
+     * @param string $day timestamp in seconds converted to days
+     * @param string $time timestamp in seconds
      */
     function addToIndexSchedules(&$sites, $address, $day, $time)
     {
@@ -171,10 +183,19 @@ class FetchController extends Controller implements CrawlConstants
     }

     /**
-     *  @param &array $sites
-     *  @param string $address
-     *  @param string $day
-     *  @param string $time
+     * Adds a file containing the to-crawl sites from the just received
+     * $sites array to the schedules folder's schedule data directory's
+     * subfolder for the current crawl time. This file is added in a sub folder
+     * $day and its name contains the $time at which it arrived and the ip
+     * $address from which it arrived. This file will then be processed later
+     * by the queue server. In addition to to-crawl sites the seen urls
+     * in $sites are also saved in the file. They are used to perform a sanity
+     * check on the priority queue by the queue server.
+     *
+     * @param &array $sites a list of seen sites and to crawl sites
+     * @param string $address the IP address of the sending machine with . -->_
+     * @param string $day timestamp in seconds converted to days
+     * @param string $time timestamp in seconds
      */
     function addToCrawlSchedules(&$sites, $address, $day, $time)
     {
@@ -207,10 +228,17 @@ class FetchController extends Controller implements CrawlConstants
     }

     /**
-     *  @param &array $sites
-     *  @param string $address
-     *  @param string $day
-     *  @param string $time
+     * Adds a file containing the robot site data from the just received
+     * $sites array to the schedules folder's robot data directory's
+     * subfolder for the current crawl time. This file is added in a sub folder
+     * $day and its name contains the $time at which it arrived and the ip
+     * $address from which it arrived. This file will then be processed later
+     * by the queue server.
+     *
+     * @param &array $sites a list of seen sites and an inverted index
+     * @param string $address the IP address of the sending machine with . -->_
+     * @param string $day timestamp in seconds converted to days
+     * @param string $time timestamp in seconds
      */
     function addRobotSchedules(&$sites, $address, $day, $time)
     {
@@ -228,12 +256,14 @@ class FetchController extends Controller implements CrawlConstants


     /**
+     * Adds a file with contents $data and with name containing $address and
+     * $time to a subfolder $day of a folder $dir
      *
-     *  @param string $dir
-     *  @param &array $data
-     *  @param string $address
-     *  @param string $day
-     *  @param string $time
+     * @param string $dir directory in which to add the schedule file
+     * @param &array $data data that the schedule file is to contain
+     * @param string $address the IP address of the sending machine with . -->_
+     * @param string $day timestamp in seconds converted to days
+     * @param string $time timestamp in seconds
      */
     function addScheduleToScheduleDirectory($dir, &$data, $address, $day, $time)
     {
diff --git a/lib/hash_table.php b/lib/hash_table.php
index 49474e0dd..6f16e89fe 100755
--- a/lib/hash_table.php
+++ b/lib/hash_table.php
@@ -34,9 +34,12 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Loads the base class
  */
 require_once "string_array.php";
+/**
+ * Needed for crawlHash
+ */
 require_once "utility.php";

 /**
@@ -53,51 +56,60 @@ class HashTable extends StringArray
 {

     /**
+     * The size in bytes for keys stored in the hash table
      *
      * @var int
      */
     var $key_size;
     /**
+     * The size in bytes of values associated with keys
      *
      * @var int
      */
     var $value_size;
     /**
-     *
+     * Holds an all \0 string of length $this->key_size
      * @var string
      */
     var $null;
     /**
-     *
+     * Holds \0\0 followed by an all \FF string of length $this->key_size -1
+     * Used to indicate that a slot once held data but that data was deleted.
+     * Such a slot tells a lookup to keep going, but on an insert can be
+     * overwritten if the inserted key is not already in the table
      * @var string
      */
     var $deleted;
     /**
-     *
+     * Number of items currently in the hash table
      * @var int
      */
     var $count;

     /**
-     *
-     *
+     * Flag for lookup methods
      */
     const ALWAYS_RETURN_PROBE = 1;
     /**
-     *
-     *
+     * Flag for lookup methods
      */
     const RETURN_PROBE_ON_KEY_FOUND = 0;
     /**
-     *
-     *
+     * Flag for lookup methods
      */
     const RETURN_VALUE = -1;


     /**
+     * Makes a persistently stored (i.e., on disk and ram)  hash table using the
+     * supplied parameters
      *
-     *
+     * @param string $fname filename to use when storing the hash table to disk
+     * @param int $num_values number of key value pairs the table can hold
+     * @param int $key_size number of bytes to store a hash table key
+     * @param int $value_size number of bytes to store a hash table value
+     * @param int $save_frequency how many non-read operations before saving to
+     *      disk
      */
     function __construct($fname, $num_values, $key_size, $value_size,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
@@ -114,8 +126,12 @@ class HashTable extends StringArray
     }

     /**
+     * Inserts the provided $key - $value pair into the hash table
      *
-     *
+     * @param string $key the key to use for the insert (will be needed for
+     *      lookup)
+     * @param string $value the value associated with $key
+     * @return bool whether the insert was successful or not
      */
     function insert($key, $value)
     {
@@ -162,8 +178,19 @@ class HashTable extends StringArray


     /**
+     * Tries to look up the key in the hash table, returning either the
+     * location where it was found or the value associated with the key.
      *
-     *
+     * @param string $key key to look up in the hash table
+     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
+     *      self::RETURN_PROBE_ON_KEY_FOUND, or self::RETURN_VALUE. Here
+     *      value means the value associated with the key and probe is
+     *      either the location in the array where the key was found or
+     *      the first location in the array where it was determined the
+     *      key could not be found.
+     * @return mixed would be string if the value is being returned,
+     *      an int if the probe is being returned, and false if the key
+     *      is not found
      */
     function lookup($key, $return_probe_value = self::RETURN_VALUE)
     {
@@ -172,8 +199,26 @@ class HashTable extends StringArray
     }

     /**
+     * Tries to look up the key in the hash table, returning either the
+     * location where it was found or the value associated with the key.
+     * If the key is not at the initial probe value, linear search in the
+     * table is done. The values which cut-off the search are stored in
+     * $null_array. Using an array allows for flexibility since a deleted
+     * entry needs to be handled differently when doing a lookup than when
+     * doing an insert.
      *
-     *
+     * @param string $key key to look up in the hash table
+     * @param array $null_array key values that would cut-off the search
+     *      for key if the initial probe failed
+     * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE,
+     *      self::RETURN_PROBE_ON_KEY_FOUND, or self::RETURN_VALUE. Here
+     *      value means the value associated with the key and probe is
+     *      either the location in the array where the key was found or
+     *      the first location in the array where it was determined the
+     *      key could not be found.
+     * @return mixed would be string if the value is being returned,
+     *      an int if the probe is being returned, and false if the key
+     *      is not found
      */
     function lookupArray($key, $null_array,
         $return_probe_value = self::RETURN_VALUE)
@@ -212,8 +257,10 @@ class HashTable extends StringArray
     }

     /**
+     * Deletes the data associated with the provided key from the hash table
      *
-     *
+     * @param string $key the key to delete the entry for
+     * @return bool whether or not something was deleted
      */
     function delete($key)
     {
@@ -234,8 +281,10 @@ class HashTable extends StringArray
     }

     /**
+     * Get the ith entry of the array for the hash table (no hashing here)
      *
-     *
+     * @param int $i an index of the hash table array
+     * @return array the key value pair stored at this index
      */
     function getEntry($i)
     {
@@ -247,8 +296,10 @@ class HashTable extends StringArray
     }

     /**
+     * Hashes the provided key to an index in the array of the hash table
      *
-     *
+     * @param string $key a key to be hashed into the hash table
+     * @return int an index in the array of the hash table
      */
     function hash($key)
     {
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 4a1358dc3..b06780150 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -132,29 +132,79 @@ function setOffsetPointers($data, &$objects, $offset_field)
  */
 class WordIterator implements IndexingConstants, CrawlConstants
 {
+    /**
+     *
+     */
     var $word_key;
+    /**
+     *
+     */
     var $index;
+    /**
+     *
+     */
     var $seen_docs;
+    /**
+     *
+     */
     var $restricted_seen_docs;
+    /**
+     *
+     */
     var $count_block_unfiltered;
+    /**
+     *
+     */
     var $num_docs;
-

+    /**
+     *
+     */
     var $next_offset;
+    /**
+     *
+     */
     var $last_pointed_block;
+    /**
+     *
+     */
     var $list_offset;

-
+    /**
+     *
+     */
     var $block_pointers;
+    /**
+     *
+     */
     var $num_full_blocks;
+    /**
+     *
+     */
     var $num_generations;
+    /**
+     *
+     */
     var $last_block;
+    /**
+     *
+     */
     var $info_block;
+    /**
+     *
+     */
     var $current_pointer;
+    /**
+     *
+     */
     var $limit;

     /**
      *
+     * @param string $word_key
+     * @param object $index
+     * @param int $limit
+     * @param object $info_block
      */
     public function __construct($word_key, $index, $limit = 0, $info_block = NULL)
     {
@@ -166,6 +216,7 @@ class WordIterator implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param object $info_block
      */
     public function reset($info_block = NULL)
     {
@@ -215,6 +266,7 @@ class WordIterator implements IndexingConstants, CrawlConstants

     /**
      *
+     * @return bool
      */
     public function initGeneration()
     {
@@ -267,6 +319,8 @@ class WordIterator implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param array $restrict_phrases
+     * @return array
      */
     public function currentDocsWithWord($restrict_phrases = NULL)
     {
@@ -389,6 +443,8 @@ class WordIterator implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param array $restrict_phrases
+     * @return array
      */
     public function nextDocsWithWord($restrict_phrases = NULL)
     {
@@ -444,6 +500,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $dir_name
+     * @param int $filter_size
+     * @param int $num_partitions_summaries
+     * @param int $num_partitions_index
+     * @param string $description
      */
     public function __construct($dir_name, $filter_size = -1,
         $num_partitions_summaries = NULL, $num_partitions_index = NULL,
@@ -486,6 +547,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $key_field
+     * @param string $offset_field
+     * @param array $pages
+     * @return array
      */
     public function addPages($key_field, $offset_field, $pages)
     {
@@ -496,6 +561,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param array $index_data
      */
     public function addIndexData($index_data)
     {
@@ -549,6 +615,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @param array &$word_data
+     * @param bool $overwrite
      */
     public function addPartitionWordData($partition,
         &$word_data, $overwrite = false)
@@ -665,6 +734,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @param string $word_key
+     * @return bool
      */
     public function addPartitionIndexFilter($partition, $word_key)
     {
@@ -681,6 +753,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @return bool
      */
     public function initPartitionIndexFilter($partition)
     {
@@ -704,6 +778,13 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $word_key
+     * @param int $limit
+     * @param int $num
+     * @param array $restrict_phrases
+     * @param string $phrase_key
+     * @param array $phrase_info
+     * @return array
      */
     public function getSummariesByHash($word_key, $limit, $num,
         $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL)
@@ -751,6 +832,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $key
+     * @param int $offset
+     * @return array
      */
     public function getPage($key, $offset)
     {
@@ -759,6 +843,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $word_key
+     * @param int $offset
+     * @param int $generation
+     * @return array
      */
     public function getWordDocBlock($word_key, $offset, $generation = -1)
     {
@@ -773,6 +861,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @param int $offset
+     * @param resource $file_handle
+     * @return array
      */
     public function getPageByPartition($partition, $offset, $file_handle = NULL)
     {
@@ -782,6 +874,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $key_field
+     * @param array $page
      */
     public function addPageFilter($key_field, $page)
     {
@@ -813,6 +907,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $phrase_key
+     * @param int $generation_index
+     * @param array $info_block
+     * @return array
      */
     public function getPhraseIndexInfo(
         $phrase_key, $generation_index = 0, $info_block = NULL)
@@ -889,6 +987,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $phrase_key
+     * @param array $info
      */
     public function setPhraseIndexInfo($phrase_key, $info)
     {
@@ -909,6 +1009,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param string $word_key
+     * @param array $restrict_phrases
+     * @param string $phrase_key
+     * @param $num_needed
      */
     public function addPhraseIndex($word_key, $restrict_phrases,
         $phrase_key, $num_needed)
@@ -979,6 +1083,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param array $word_keys
+     * @param int $num
+     * @param string $comparison
+     * @return array
      */
     public function getSelectiveWords($word_keys, $num, $comparison="lessThan")
         //lessThan is in utility.php
@@ -1002,6 +1110,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @param int $generation
+     * @return array
      */
     public function readPartitionInfoBlock($partition, $generation = -1)
     {
@@ -1017,6 +1128,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants

     /**
      *
+     * @param int $partition
+     * @param array $data
      */
     public function writePartitionInfoBlock($partition, $data)
     {
diff --git a/lib/priority_queue.php b/lib/priority_queue.php
index b50468948..763863c3d 100755
--- a/lib/priority_queue.php
+++ b/lib/priority_queue.php
@@ -38,15 +38,15 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  */
 require_once "string_array.php";
 /**
- *
+ * A Notifier is called when data in the queue is moved around
  */
 require_once "notifier.php";
 /**
- *
+ * Loaded for crawlLog function
  */
 require_once "utility.php";
 /**
- *
+ * Constants shared among classes involved in storing web crawl data
  */
 require_once "crawl_constants.php";

@@ -63,40 +63,50 @@ require_once "crawl_constants.php";
 class PriorityQueue extends StringArray implements CrawlConstants
 {
     /**
-     *
-     *
+     * Number of values that can be stored in the priority queue
+     * @var int
      */
     var $num_values;
     /**
-     *
-     *
+     * Number of bytes needed to store a value associated with a weight
+     * @var int
      */
     var $value_size;
     /**
-     *
-     *
+     * Number of bytes needed to store a weight in the queue
+     * @var int
      */
     var $weight_size = 4; //size of a float

     /**
-     *
-     *
+     * Number of items that are currently stored in the queue
+     * @var int
      */
     var $count;
     /**
-     *
-     *
+     * Whether polling the queue returns the least or most weighted value
+     * @var string
      */
     var $min_or_max;

     /**
-     *
-     *
+     * An object that implements the Notifier interface (for instance,
+     * WebQueueArchive)
+     * @var object
      */
     var $notifier; // who to call if move an item in queue

     /**
+     * Makes a priority queue with the given operating parameters
      *
+     * @param string $fname filename to store the data associated with the queue
+     * @param int $num_values number of values the queue can hold
+     * @param int $value_size the size in bytes of a value
+     * @param string $min_or_max whether this priority queue returns least or
+     *  most weight values when polled
+     * @param object $notifier object to call when a value changes in the queue
+     * @param int $save_frequency how often the data in the queue should be
+     *      saved to disk. (Its default location is RAM)
      */
     function __construct($fname, $num_values, $value_size,
         $min_or_max, $notifier = NULL,
@@ -116,7 +126,11 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Gets the data stored at the ith location in the priority queue
      *
+     * @param int $i location to return data from
+     * @return mixed data if the value of $i is between 1 and count, false
+     *      otherwise
      */
     function peek($i = 1)
     {
@@ -129,6 +143,8 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param int $i
+     * @return mixed
      */
     function poll($i = 1)
     {
@@ -149,7 +165,9 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
-     *
+     * @param string $data
+     * @param float $weight
+     * @return mixed
      */
     function insert($data, $weight)
     {
@@ -168,6 +186,8 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param int $i
+     * @param float $delta
      */
     function adjustWeight($i, $delta)
     {
@@ -209,6 +229,7 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @return array
      */
     function getContents()
     {
@@ -222,6 +243,7 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param int $new_total
      */
     function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
@@ -246,6 +268,8 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param int $i
+     * @return int
      */
     function percolateUp($i)
     {
@@ -274,6 +298,8 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param int $i
+     * @return int
      */
     function percolateDown($i)
     {
@@ -314,6 +340,9 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     * @param float $value1
+     * @param float $value2
+     * @return float
      */
     function compare($value1, $value2)
     {
@@ -326,6 +355,9 @@ class PriorityQueue extends StringArray implements CrawlConstants

     /**
      *
+     *
+     * @param int $i
+     * @return array
      */
     function getRow($i)
     {
@@ -345,7 +377,13 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Add data to the $i row of the priority queue viewed as an array
+     * Calls the notifier associated with this queue about the change
+     * in data's location
      *
+     * @param int $i location to add data
+     * @param array $row data to add (a two element array in the form
+     *      key, float value).
      */
     function putRow($i, $row)
     {
@@ -359,7 +397,9 @@ class PriorityQueue extends StringArray implements CrawlConstants
     }

     /**
+     * Computes and returns the weight of all items in the priority queue
      *
+     * @return float weight of all items stored in the priority queue
      */
     function totalWeight()
     {
diff --git a/lib/processors/doc_processor.php b/lib/processors/doc_processor.php
index 1260db9e9..bd57b603a 100755
--- a/lib/processors/doc_processor.php
+++ b/lib/processors/doc_processor.php
@@ -34,7 +34,7 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Load the base class
  */

 require_once BASE_DIR."/lib/processors/text_processor.php";
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index e24e3ad03..ec7930e8e 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -34,9 +34,12 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- * Load base class, if needed. We also might need to parse urls
+ * Load base class, if needed.
  */
 require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Loaded so we can parse urls
+ */
 require_once BASE_DIR."/lib/url_parser.php";

  /**
diff --git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php
index d83df545e..b8e6fb415 100755
--- a/lib/processors/pdf_processor.php
+++ b/lib/processors/pdf_processor.php
@@ -79,7 +79,10 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Gets the text out of a PDF document
      *
+     * @param string $pdf_string a string representing the PDF document
+     * @return string text extracted from the document
      */
     static function getText($pdf_string) {
         $len = strlen($pdf_string);
@@ -128,7 +131,12 @@ class PdfProcessor extends TextProcessor
     }

     /**
+     * Gets the contents between an obj and endobj tag at the current position
+     * in a PDF document
      *
+     * @param string $pdf_string a string of a PDF document
+     * @param int $cur_pos an integer position in that string
+     * @return string the contents of the PDF object located at $cur_pos
      */
     static function getNextObject($pdf_string, $cur_pos)
     {
@@ -137,11 +145,13 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $object_dictionary
+     * @param array $type_array
      */
-    static function objectDictionaryHas($object_Dictionary, $type_array)
+    static function objectDictionaryHas($object_dictionary, $type_array)
     {
         foreach ($type_array as $type) {
-            if(strstr($object_Dictionary, $type)) {
+            if(strstr($object_dictionary, $type)) {
                 return true;
             }
         }
@@ -151,6 +161,8 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $object_string
+     * @return string
      */
     static function getObjectDictionary($object_string)
     {
@@ -161,6 +173,8 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $object_string
+     * @return string
      */
     static function getObjectStream($object_string)
     {
@@ -172,6 +186,8 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $data
+     * @return string
      */
     static function parseText($data)
     {
@@ -212,6 +228,9 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $data
+     * @param int $cur_pos
+     * @return array
      */
     static function parseBrackets($data, $cur_pos)
     {
@@ -250,6 +269,9 @@ class PdfProcessor extends TextProcessor

     /**
      *
+     * @param string $data
+     * @param int $cur_pos
+     * @return array
      */
     static function parseParentheses($data, $cur_pos)
     {
diff --git a/lib/processors/ppt_processor.php b/lib/processors/ppt_processor.php
index e49d05ada..0d72ea581 100755
--- a/lib/processors/ppt_processor.php
+++ b/lib/processors/ppt_processor.php
@@ -39,7 +39,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 require_once BASE_DIR."/lib/processors/text_processor.php";


- /**
+/**
  * Used to create crawl summary information
  * for PPT files
  *
@@ -58,7 +58,21 @@ class PptProcessor extends TextProcessor
     const ALWAYS_IGNORE = 6;

     /**
+     * Computes a summary based on a string of a binary Powerpoint document
+     * (as opposed to the modern xml powerpoint format).
+     *
+     * Text is extracted from the Powerpoint document using a crude finite
+     * state machine that was developed by looking at a few Powerpoint
+     * documents in a Hex editor. Then the TextProcessor::process() method
+     * is used to make a summary
      *
+     * @param string $page string of a Powerpoint document
+     * @param string $url location the document came from, not used by
+     *      TextProcessor at this point. Some of its subclasses override
+     *      this method and use url to produce complete links for
+     *      relative links within a document
+     * @return array a summary of (title, description, links, and content) of
+     *      the information in $page
      */
     public static function process($page, $url)
     {
diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php
index 16e504292..96994dfb7 100755
--- a/lib/processors/rtf_processor.php
+++ b/lib/processors/rtf_processor.php
@@ -50,7 +50,9 @@ class RtfProcessor extends TextProcessor
 {

     /**
-     *
+     * @param string $page
+     * @param string $url
+     * @return array
      */
     public static function process($page, $url)
     {
@@ -71,6 +73,8 @@ class RtfProcessor extends TextProcessor

     /**
      *
+     * @param string $rtf_string
+     * @return string
      */
     static function extractText($rtf_string) {
         $rtf_string = preg_replace('/\\\{/',"!ZZBL!", $rtf_string);
@@ -89,6 +93,8 @@ class RtfProcessor extends TextProcessor

     /**
      *
+     * @param string $rtf_string
+     * @return string
      */
     static function getText($rtf_string)
     {
@@ -125,6 +131,9 @@ class RtfProcessor extends TextProcessor

     /**
      *
+     * @param string $rtf_string
+     * @param int $cur_pos
+     * @return string
      */
     static function getNextObject($rtf_string, $cur_pos)
     {
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 8773576d3..ad518840c 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -34,7 +34,7 @@
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /**
- *
+ * Loads in the common constants used by all classes related to crawling
  */
 require_once BASE_DIR."/lib/crawl_constants.php";

@@ -51,7 +51,15 @@ class TextProcessor implements CrawlConstants
 {

     /**
+     * Computes a summary based on a text string of a document
      *
+     * @param string $page text string of a document
+     * @param string $url location the document came from, not used by
+     *      TextProcessor at this point. Some of its subclasses override
+     *      this method and use url to produce complete links for
+     *      relative links within a document
+     * @return array a summary of (title, description, links, and content) of
+     *      the information in $page
      */
     static function process($page, $url)
     {
@@ -65,7 +73,15 @@ class TextProcessor implements CrawlConstants
     }

     /**
+     * Gets the text between two tags in a document starting at the current
+     * position.
      *
+     * @param string $string document to extract text from
+     * @param int $cur_pos current location to check if text can be extracted
+     * @param string $start_tag starting tag that we want to extract after
+     * @param string $end_tag ending tag that we want to extract until
+     * @return array pair consisting of where in the document we are after
+     *      the end tag, together with the data between the two tags
      */
     static function getBetweenTags($string, $cur_pos, $start_tag, $end_tag)
     {
diff --git a/lib/web_archive.php b/lib/web_archive.php
index b41a00dc7..85ffea504 100755
--- a/lib/web_archive.php
+++ b/lib/web_archive.php
@@ -81,6 +81,9 @@ class WebArchive
     const CLOSE = 3;
     /**
      *
+     * @param string $fname
+     * @param string $compressor
+     * @param bool $fast_construct
      */
     function __construct($fname, $compressor, $fast_construct = false)
     {
@@ -102,6 +105,7 @@ class WebArchive

     /**
      *
+     * @return array
      */
     function readInfoBlock()
     {
@@ -120,6 +124,8 @@ class WebArchive

     /**
      *
+     * @param resource $fh
+     * @param array &$data
      */
     function writeInfoBlock($fh = NULL, &$data = NULL)
     {
@@ -148,6 +154,8 @@ class WebArchive

     /**
      *
+     * @param resource $fh
+     * @return int
      */
     function seekEndObjects($fh)
     {
@@ -161,6 +169,12 @@ class WebArchive

     /**
      *
+     * @param string $offset_field
+     * @param array &$objects
+     * @param array $data
+     * @param string $callback
+     * @param bool $return_flag
+     * @return mixed
      */
     function addObjects($offset_field, &$objects,
         $data = NULL, $callback = NULL, $return_flag = true)
@@ -211,6 +225,8 @@ class WebArchive

     /**
      *
+     * @param string $mode
+     * @return resource
      */
     function open($mode = "r")
     {
@@ -228,6 +244,11 @@ class WebArchive

     /**
      *
+     * @param int $offset
+     * @param int $num
+     * @param bool $next_flag
+     * @param resource $fh
+     * @return array
      */
     function getObjects($offset, $num, $next_flag = true, $fh = NULL)
     {
diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php
index ff7fac6d4..9f77ca1e8 100755
--- a/lib/web_archive_bundle.php
+++ b/lib/web_archive_bundle.php
@@ -103,6 +103,11 @@ class WebArchiveBundle

     /**
      *
+     * @param string $dir_name
+     * @param int $filter_size
+     * @param int $num_partitions
+     * @param string $description
+     * @param string $compressor
      */
     function __construct($dir_name, $filter_size = -1,
         $num_partitions = NULL, $description = NULL,
@@ -172,6 +177,10 @@ class WebArchiveBundle

     /**
      *
+     * @param string $key_field
+     * @param string $offset_field
+     * @param array &$pages
+     * @return array
      */
     function addPages($key_field, $offset_field, &$pages)
     {
@@ -214,6 +223,9 @@ class WebArchiveBundle

     /**
      *
+     * @param string $key
+     * @param int $offset
+     * @return array
      */
     function getPage($key, $offset)
     {
@@ -225,6 +237,10 @@ class WebArchiveBundle

     /**
      *
+     * @param int $partition
+     * @param int $offset
+     * @param resource $file_handle
+     * @return array
      */
     function getPageByPartition($partition, $offset, $file_handle = NULL)
     {
@@ -241,6 +257,9 @@ class WebArchiveBundle

     /**
      *
+     * @param string $key_field
+     * @param array &$page
+     * @return bool
      */
     function addPageFilter($key_field, &$page)
     {
@@ -254,6 +273,13 @@ class WebArchiveBundle

     /**
      *
+     * @param string $offset_field
+     * @param int $partition
+     * @param array &$objects
+     * @param array $data
+     * @param string $callback
+     * @param bool $return_flag
+     * @return mixed
      */
     function addObjectsPartition($offset_field, $partition,
         &$objects, $data = NULL, $callback = NULL, $return_flag = true)
@@ -267,6 +293,8 @@ class WebArchiveBundle

     /**
      *
+     * @param int $partition
+     * @return array
      */
     function readPartitionInfoBlock($partition)
     {
@@ -275,6 +303,8 @@ class WebArchiveBundle

     /**
      *
+     * @param int $partition
+     * @param array &$data
      */
     function writePartitionInfoBlock($partition, &$data)
     {
@@ -283,6 +313,9 @@ class WebArchiveBundle

     /**
      *
+     * @param array $pages
+     * @param string $key_field
+     * @return mixed
      */
     function differencePageKeysFilter($pages, $key_field)
     {
@@ -302,6 +335,8 @@ class WebArchiveBundle

     /**
      *
+     * @param array &$page_array
+     * @param string $field_name
      */
     function differencePagesFilter(&$page_array, $field_name = NULL)
     {
@@ -321,6 +356,8 @@ class WebArchiveBundle

     /**
      *
+     * @param int $index
+     * @param bool $fast_construct
      */
     function getPartition($index, $fast_construct = true)
     {
@@ -343,6 +380,7 @@ class WebArchiveBundle

     /**
      *
+     * @param int $num
      */
     function addCount($num)
     {
@@ -354,6 +392,8 @@ class WebArchiveBundle

     /**
      *
+     * @param string $dir_name
+     * @return array
      */
     static function getArchiveInfo($dir_name)
     {
@@ -374,6 +414,9 @@ class WebArchiveBundle

     /**
      *
+     * @param string $value
+     * @param int $num_partitions
+     * @return int
      */
     static function selectPartition($value, $num_partitions)
     {
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index f64f21f67..20eb54ef3 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -145,6 +145,10 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $dir_name
+     * @param int $filter_size
+     * @param int $num_urls_ram
+     * @param string $min_or_max
      */
     function __construct($dir_name,
         $filter_size, $num_urls_ram, $min_or_max)
@@ -231,6 +235,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param array $url_pairs
      */
     function addUrlsQueue(&$url_pairs)
     {
@@ -273,6 +278,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $url
      */
     function containsUrlQueue(&$url)
     {
@@ -283,6 +289,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $url
+     * @param float $delta
      */
     function adjustQueueWeight(&$url, $delta)
     {
@@ -301,6 +309,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $url
      */
     function removeQueue($url)
     {
@@ -323,6 +332,9 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param int $i
+     * @param resource $fh
+     * @return mixed
      */
     function peekQueue($i = 1, $fh = NULL)
     {
@@ -365,6 +377,10 @@ class WebQueueBundle implements Notifier
         }
     }

+    /**
+     *
+     * @return array
+     */
     function getContents()
     {
         $count = $this->to_crawl_queue->count;
@@ -377,6 +393,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param int $new_total
      */
     function normalize($new_total = NUM_URLS_QUEUE_RAM)
     {
@@ -387,6 +404,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $mode
+     * @return resource
      */
     function openUrlArchive($mode = "r")
     {
@@ -395,6 +414,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param resource $fh
      */
     function closeUrlArchive($fh)
     {
@@ -403,6 +423,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $url
      */
     function addSeenUrlFilter($url)
     {
@@ -411,6 +432,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param array &$url_array
+     * @param string $field_name
      */
     function differenceSeenUrls(&$url_array, $field_name = NULL)
     {
@@ -420,6 +443,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host
      */
     function addGotRobotTxtFilter($host)
     {
@@ -428,6 +452,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host
+     * @return bool
      */
     function containsGotRobotTxt($host)
     {
@@ -436,6 +462,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host
      */
     function addDisallowedRobotFilter($host)
     {
@@ -444,6 +471,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host_path
+     * @return bool
      */
     function containsDisallowedRobot($host_path)
     {
@@ -452,6 +481,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @return int
      */
     function getRobotTxtAge()
     {
@@ -464,6 +494,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host
+     * @param int $value
      */
     function setCrawlDelay($host, $value)
     {
@@ -480,6 +512,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $host
+     * @return int
      */
     function getCrawlDelay($host)
     {
@@ -499,6 +533,9 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $name
+     * @param int $num_values
+     * @return object
      */
     function constructHashTable($name, $num_values)
     {
@@ -509,6 +546,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $key
+     * @return string
      */
     function lookupHashTable($key)
     {
@@ -517,6 +556,7 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $value
      */
     function deleteHashTable($value)
     {
@@ -529,6 +569,9 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param string $key
+     * @param string $value
+     * @return bool
      */
     function insertHashTable($key, $value)
     {
@@ -616,6 +659,7 @@ class WebQueueBundle implements Notifier
     }

     /**
+     *
      *
      */
     function emptyRobotFilters()
@@ -645,6 +689,8 @@ class WebQueueBundle implements Notifier

     /**
      *
+     * @param int $index
+     * @param array $data
      */
     function notify($index, $data)
     {
ViewGit