diff --git a/bin/queue_server.php b/bin/queue_server.php index abfcb19a8..40da07eb4 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -157,7 +157,7 @@ class QueueServer implements CrawlConstants * Makes a queue_server object with the supplied indexed_file_types * * As part of the creation process, a database manager is initialized so - * the queue_server cna make use of its file/folder manipulation functions. + * the queue_server can make use of its file/folder manipulation functions. */ function __construct($indexed_file_types) { @@ -190,7 +190,11 @@ class QueueServer implements CrawlConstants } /** + * Main runtime loop of the queue_server. * + * Loops until a stop message received, check for start, stop, resume + * crawl messages, deletes any WebQueueBundle for which an + * IndexArchiveBundle does not exist. Processes */ function loop() { @@ -249,9 +253,15 @@ class QueueServer implements CrawlConstants } /** + * Handles messages passed via files to the QueueServer. * - * @param array $info - * @return array + * These files are typically written by the CrawlDaemon::init() + * when QueueServer is run using command-line argument + * + * @param array $info associative array with info about current state of + * queue_server + * @return array an updates version $info reflecting changes that occurred + * during the handling of the admin messages files. */ function handleAdminMessages($info) { @@ -274,7 +284,7 @@ class QueueServer implements CrawlConstants } if(isset($this->index_archive)) { $this->index_archive->forceSave(); - // chmod so apahce can also write to these directories + // chmod so apache can also write to these directories $this->db->setWorldPermissionsRecursive( CRAWL_DIR.'/cache/'. 
self::index_data_base_name.$this->crawl_time); @@ -321,8 +331,11 @@ class QueueServer implements CrawlConstants } /** + * Begins crawling based on time, order, restricted site $info + * Setting up a crawl involves creating a queue bundle and an + * index archive bundle * - * @param array $info + * @param array $info parameters for the crawl */ function startCrawl($info) { @@ -378,7 +391,8 @@ class QueueServer implements CrawlConstants } /** - * + * Delete all the queue schedules in the cache that don't have an + * associated index bundle as this means that crawl has been deleted. */ function deleteOrphanedBundles() { @@ -399,9 +413,13 @@ class QueueServer implements CrawlConstants } /** - * - * @param string $base_dir - * @param string $callback_method + * Generic function used to process Data, Index, and Robot info schedules + * Finds the first file in the directory of schedules of the given + * type, and calls the appropriate callback method for that type. + * + * @param string $base_dir directory of schedules + * @param string $callback_method what method should be called to handle + * a schedule */ function processDataFile($base_dir, $callback_method) { @@ -435,7 +453,9 @@ class QueueServer implements CrawlConstants } /** - * + * Sets up the directory to look for a file of unprocessed + * index archive data from fetchers then calls the function + * processDataFile to process the oldest file found */ function processIndexData() { @@ -448,8 +468,10 @@ class QueueServer implements CrawlConstants } /** + * Adds the summary and index data in $file to summary bundle and word index * - * @param string $file + * @param string $file containing web page summaries and a mini-inverted + * index for their content */ function processIndexArchive($file) { @@ -538,7 +560,9 @@ class QueueServer implements CrawlConstants } /** - * + * Checks how old the oldest robot data is and dumps if older than a + * threshold, then sets up the path to the robot schedule directory + * 
and tries to process a file of robots.txt robot paths data from there */ function processRobotUrls() { @@ -560,8 +584,11 @@ class QueueServer implements CrawlConstants } /** - * - * @param string $file + * Reads in $file of robot data adding host-paths to the disallowed + * robot filter and setting the delay in the delay filter of + * crawled delayed hosts + * @param string $file file to read of robot data, is removed after + * processing */ function processRobotArchive($file) { @@ -602,7 +629,13 @@ class QueueServer implements CrawlConstants } /** - * + * Deletes all robot information stored by the QueueServer. + * + * This function is called roughly every CACHE_ROBOT_TXT_TIME. + * It forces the crawler to redownload robots.txt files before hosts + * can continue to be crawled. This ensures the cached robots.txt + * file is never too old. Thus, if someone changes it to allow or disallow + * the crawler it will be noticed reasonably promptly. */ function deleteRobotData() { @@ -612,13 +645,16 @@ class QueueServer implements CrawlConstants self::robot_data_base_name.$this->crawl_time; $this->db->unlinkRecursive($robot_schedules, true); - crawlLog("... reseting robot bloom filters ..."); + crawlLog("... resetting robot bloom filters ..."); $this->web_queue->emptyRobotFilters(); } /** + * Checks for a new crawl file or schedule data for the current crawl and + * if such a file exists then processes its contents adding the relevant urls to + * the priority queue * - * @return array + * @return array info array with continue status */ function processQueueUrls() { @@ -631,10 +667,9 @@ class QueueServer implements CrawlConstants if(file_exists(CRAWL_DIR."/schedules/".self::schedule_start_name)) { crawlLog( "Start schedule urls".CRAWL_DIR. 
- "/schedules/".self::schedule_start_name); - $info = array_merge($info, - $this->processDataArchive( - CRAWL_DIR."/schedules/".self::schedule_start_name)); + "/schedules/".self::schedule_start_name); + $this->processDataArchive( + CRAWL_DIR."/schedules/".self::schedule_start_name); return $info; } @@ -649,8 +684,11 @@ class QueueServer implements CrawlConstants } /** - * @param string $file - * @return array + * Process a file of to-crawl urls adding to or adjusting the weight in + * the PriorityQueue of those which have not been seen. Also + * updates the queue with seen url info + * + * @param string $file containing serialized to crawl and seen url info */ function processDataArchive($file) { @@ -658,8 +696,6 @@ class QueueServer implements CrawlConstants $sites = unserialize(file_get_contents($file)); - $info = array(); - if(isset($sites[self::MACHINE])) { $this->most_recent_fetcher = $sites[self::MACHINE]; } @@ -788,13 +824,12 @@ class QueueServer implements CrawlConstants crawlLog("URL: $url"); } - return $info; - } /** + * Removes the already seen urls from the supplied array * - * @param array &$sites + * @param array &$sites url data to check if seen */ function deleteSeenUrls(&$sites) { diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php index 32ff0be7b..6962e7b34 100644 --- a/lib/bloom_filter_bundle.php +++ b/lib/bloom_filter_bundle.php @@ -141,8 +141,8 @@ class BloomFilterBundle } /** - * Removes from the passed array those elements $elt who either are not in - * the filter bundle or whose $elt[$field_name] is not in the bundle. + * Removes from the passed array those elements $elt who either are in + * the filter bundle or whose $elt[$field_name] is in the bundle. 
* * @param array &$arr the array to remove elements from * @param string $field_name if not NULL the field name of $arr to use to diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index b06780150..0b759e982 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -38,27 +38,19 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} */ require_once 'web_archive_bundle.php'; /** - * Filters used to check if words appear in a given generation + * Bloom Filter used by BloomFilterBundle */ require_once 'bloom_filter_file.php'; /** - * + * Used to check if a page is already stored in the WebArchiveBundle */ require_once 'bloom_filter_bundle.php'; /** - * - */ -require_once 'gzip_compressor.php'; -/** - * - */ -require_once 'non_compressor.php'; -/** - * + * Used for crawlLog and crawlHash */ require_once 'utility.php'; -/** Loads common constants for web crawling - * +/** + * Loads common constants for web crawling */ require_once 'crawl_constants.php'; @@ -88,7 +80,7 @@ interface IndexingConstants /** * Callback function used to set the offsets into the archive file from - * the paritcular word info in the header block of a WordArchive + * the particular word info in the header block of a WordArchive * * @param array $data * @param array $objects @@ -125,86 +117,137 @@ function setOffsetPointers($data, &$objects, $offset_field) } /** + * Used to iterate through the documents associated with a word in + * an IndexArchiveBundle. It also makes it easy to get the summaries + * of these documents and restrict the documents by additional words. + * + * A description of how words and the documents containing them are stored + * is given in the documentation of IndexArchiveBundle. To iterate over + * all documents containing a word, its hash, word_key, is formed. Then using + * the Bloom filter for that partition, it is determined if the word is stored + * at all, and if it is, which generations it occurs in. 
Then the iterator + * is set to point to the first block of the first generation the word appears + * in that is greater than the limit of the WordIterator. Thereafter, + * nextDocsWithWord will advance $this->current_pointer by one per call. + * $this->current_pointer keeps track of which block of documents containing + * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and + * there are still more blocks, then the corresponding block_pointer of the word + * from the generation's partition info_block is used to look up the offset to + * the doc block. If it is greater than this value then the linked list + * of doc blocks pointed to for the partition is followed to get the appropriate + * block. This list is in the order that words were stored in the index so + * LIST_OFFSET points to the last block stored, which in turn points to the + * next to last block, etc. Finally, when all the blocks in the linked-list are + * exhausted, the remaining docs for that generation for that word are stored + * in the info block for the word itself (this will always be less than + * BLOCK_SIZE many). Once all the docs for a word for a generation have been + * iterated through, then iteration proceeds to the next generation containing + * the word. 
* * @author Chris Pollett * @package seek_quarry * @subpackage library + * @see IndexArchiveBundle */ class WordIterator implements IndexingConstants, CrawlConstants { /** - * + * hash of word that the iterator iterates over + * @var string */ var $word_key; /** - * + * The IndexArchiveBundle this index is associated with + * @var object */ var $index; /** - * + * The number of documents already iterated over + * @var int */ var $seen_docs; /** - * + * @var int */ var $restricted_seen_docs; /** - * + * The number of documents in the current block before filtering + * by restricted words + * @var int */ var $count_block_unfiltered; /** - * + * Estimate of the number of documents that this iterator can return + * @var int */ var $num_docs; /** - * + * If iterating through the linked-list portions of the documents + * the next byte offset in the WebArchive based linked-list + * @var int */ var $next_offset; /** - * + * Block number of the last block of docs + * @var int */ var $last_pointed_block; /** - * + * @var int */ var $list_offset; /** - * + * Pointers to offsets for blocks containing docs with the given word + * for the current generation + * @var array */ var $block_pointers; /** - * + * Number of completely full blocks of documents for the current generation + * @var int */ var $num_full_blocks; /** - * + * Number of generations word appears in + * @var int */ var $num_generations; /** - * + * Used to store the contents of the last partially full block + * @var int */ var $last_block; /** - * + * + * @var object */ var $info_block; /** - * + * Stores the number of the current block of documents we are at in the + * set of all blocks of BLOCK_SIZE many documents + * @var int */ var $current_pointer; /** - * + * First document that should be returned + * amongst all of the documents associated with the + * iterator's $word_key + * @var int */ var $limit; /** + * Creates a word iterator with the given parameters. 
* - * @param string $word_key - * @param object $index - * @param int $limit - * @param object $info_block + * @param string $word_key hash of word or phrase to iterate docs of + * @param object $index the IndexArchiveBundle to use + * @param int $limit the first element to return from the list of docs + * iterated over + * @param object $info_block the info block of the WebArchive + * associated with the word in the index. If NULL, then this will + * be loaded in WordIterator::reset() */ public function __construct($word_key, $index, $limit = 0, $info_block = NULL) { @@ -215,8 +258,13 @@ class WordIterator implements IndexingConstants, CrawlConstants } /** + * Returns the iterator to the first document block that it could iterate + * over * - * @param object $info_block + * @param object $info_block the header block in the index WebArchiveBundle + * for the word this iterator iterates over. If not NULL, this saves + * the time to load it. Otherwise it will be loaded, but this will be + * slower. */ public function reset($info_block = NULL) { @@ -265,8 +313,9 @@ class WordIterator implements IndexingConstants, CrawlConstants } /** + * Sets up the iterator to iterate through the current generation. 
* - * @return bool + * @return bool whether the initialization succeeds */ public function initGeneration() { @@ -318,9 +367,11 @@ class WordIterator implements IndexingConstants, CrawlConstants } /** - * - * @param array $restrict_phrases - * @return array + * Gets the block of doc summaries associated with the current doc + * pointer and which match the array of additional word restrictions + * @param array $restrict_phrases an array of additional words or phrases + * to see if contained in summary + * @return array doc summaries that match */ public function currentDocsWithWord($restrict_phrases = NULL) { @@ -442,9 +493,12 @@ class WordIterator implements IndexingConstants, CrawlConstants } /** + * Get the current block of doc summaries for the word iterator and advances + * the current pointer to the next block * - * @param array $restrict_phrases - * @return array + * @param array $restrict_phrases additional words to restrict doc summaries + * returned + * @return array doc summaries matching the $restrict_phrases */ public function nextDocsWithWord($restrict_phrases = NULL) { @@ -480,6 +534,43 @@ class WordIterator implements IndexingConstants, CrawlConstants } /** + * Encapsulates a set of web page summaries and an inverted word-index of terms + * from these summaries which allow one to search for summaries containing a + * particular word. + * + * The basic file structures for an IndexArchiveBundle are: + * <ol> + * <li>A WebArchiveBundle for web page summaries.</li> + * <li>A set of WebArchiveBundles for the inverted index. Each such bundle + * is called a <b>generation</b>. These bundles have name index0, index1,... + * The file generations.txt keeps track of what is the current generation + * and how many words have been stored in it. A given generation can + * hold NUM_WORDS_PER_GENERATION words amongst all its partitions. After which + * the next generation begins. 
In a given generation, a word is stored in + * the partition that its hash key hashes to. The same word may appear in + * several generations. The info block for a partition for a particular + * generation contains objects for each word of the generation that hashed + * to that partition. Each such word object contains a count of the number + * of documents it occurred in for that generation. It also has an + * array of block_pointers to blocks of size BLOCK_SIZE. These blocks contain + * documents that the word occurred in, the score for the occurrence, and + * an offset into the summary file for that document. If the total number of + * documents is not a multiple of BLOCK_SIZE the remaining documents are stored + * directly in the word's info block object. If, in a given generation, a + * word occurs more than COMMON_WORD_THRESHOLD many times then the word object + * uses a LIST_OFFSET pointer to point to a linked list in the partition of + * additional blocks of documents for that word. + * </li> + * <li>For each partition and for all generations a BloomFilterFile is used + * to keep track of which words appear in which generations for a + * particular partition. These filters are stored in a folder within the + * IndexArchiveBundle called index_filters. When a word and documents + * containing it are stored in an IndexArchiveBundle, its word_key (its hash) is + * stored in the filter for the partition its word_key hashes to. 
Further + * if the current generation is i, then word_key concatenated with i is + * also stored in this same filter.</li> + * </ol> + * * * @author Chris Pollett * @package seek_quarry @@ -487,24 +578,74 @@ class WordIterator implements IndexingConstants, CrawlConstants */ class IndexArchiveBundle implements IndexingConstants, CrawlConstants { + /** + * Used to keep track of the time to perform various operations + * in this IndexArchiveBundle + * @var array + */ var $diagnostics; + /** + * Folder name to use for this IndexArchiveBundle + * @var string + */ var $dir_name; + /** + * A short text name for this IndexArchiveBundle + * @var string + */ var $description; + /** + * Number of partitions in the summaries WebArchiveBundle + * @var int + */ var $num_partitions_summaries; + /** + * Number of partitions in the inverted word index + * (same for each generation) + * @var int + */ var $num_partitions_index; + /** + * structure contains info about the current generation: + * its index (ACTIVE), and the number of words it contains + * (NUM_WORDS). 
+ * @var array + */ var $generation_info; + /** + * Number of words before a new generation is started + * @var int + */ var $num_words_per_generation; + /** + * WebArchiveBundle for web page summaries + * @var object + */ var $summaries; + /** + * WebArchiveBundle for inverted word index + * @var object + */ var $index; + /** + * Bloom Filters used to figure out which words are in which generations for + * given partitions + * @var array + */ var $index_partition_filters; /** + * Makes or initializes an IndexArchiveBundle with the provided parameters * - * @param string $dir_name - * @param int $filter_size - * @param int $num_partitions_summaries - * @param int $num_parititions_index - * @param string $description + * @param string $dir_name folder name to store this bundle + * @param int $filter_size size of a Bloom filter for the word index + * partition filters as well as for the page_exists_filters in + * the WebArchiveBundles + * @param int $num_partitions_summaries number of WebArchive partitions + * to use in the summaries WebArchiveBundle + * @param int $num_partitions_index number of WebArchive partitions + * to use in the index WebArchiveBundle + * @param string $description a short text name for this IndexArchiveBundle */ public function __construct($dir_name, $filter_size = -1, $num_partitions_summaries = NULL, $num_partitions_index = NULL, @@ -546,11 +687,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Add the array of $pages to the summaries WebArchiveBundle pages being + * stored in the partition according to the $key_field and the field used + * to store the resulting offsets given by $offset_field. 
* - * @param string $key_field - * @param string $offset_field - * @param array $pages - * @return array + * @param string $key_field field used to select partition + * @param string $offset_field field used to record offsets after storing + * @param array $pages data to store + * @return array $pages adjusted with offset field */ public function addPages($key_field, $offset_field, $pages) { @@ -560,8 +704,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Adds the provided mini inverted index data to the IndexArchiveBundle * - * @param array $index_data + * @param array $index_data a mini inverted index of word_key=>doc data + * to add to this IndexArchiveBundle */ public function addIndexData($index_data) { @@ -614,10 +760,18 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Adds the mini-inverted index data to a particular partition. + * It is assumed the word keys in this data would hash to the destined + * index partitions * - * @param int $partition - * @param array &$word_data - * @param bool $overwrite + * @param int $partition WebArchive in the index WebArchiveBundle of the + * current generation to write to + * @param array &$word_data what to write + * @param bool $overwrite whether to signal that all data in prior + * generations associated with keys that are being inserted should be + * ignored (for instance, multi-word searches are partially computed and + * added to the index. If these get recomputed we might want to ignore + * prior work. 
) */ public function addPartitionWordData($partition, &$word_data, $overwrite = false) @@ -733,10 +887,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Adds the provided $word_key to the BloomFilter for the given partition * - * @param int $partition - * @param string $word_key - * @return bool + * @param int $partition whose Bloom Filter we want to add the word_key to + * @param string $word_key the key to add + * @return bool whether the add was successful */ public function addPartitionIndexFilter($partition, $word_key) { @@ -752,9 +907,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** - * - * @param int $partition - * @return bool + * Initializes or constructs the Bloom filter assocaited with a partition + * @param int $partition index of desired partition + * @return bool whether the operation was successful */ public function initPartitionIndexFilter($partition) { @@ -777,14 +932,18 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** - * - * @param string $word_key - * @param int $Lint - * @param int $num - * @param array $restrict_phrases - * @param string $phrase_key - * @param array $phrase_info - * @return array + * Gets doc summaries of documents containing a given word and meeting the + * additional provided criteria + * @param string $word_key the word to iterate over to get document results + * of + * @param int $limit number of first document in order to return + * @param int $num number of documents to return summaries of + * @param array $restrict_phrases additional words and phrase to store + * further restrict the search + * @param string $phrase_key a hash of the word and restricted phrases to + * store the results of the look up + * @param array $phrase_info info block of the word + * @return array document summaries */ public function getSummariesByHash($word_key, $limit, $num, $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL) @@ 
-831,10 +990,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Gets the page out of the summaries WebArchiveBundle with the given + * key and offset * - * @param string $key - * @param int $offset - * @return array + * The $key determines the partition WebArchive, the $offset give the + * byte offset within that archive. + * @param string $key hash to use to look up WebArchive partition + * @param int $offset byte offset in partition of desired page + * @return array desired page */ public function getPage($key, $offset) { @@ -842,11 +1005,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Returns a block of documents a word occur in. The doc block looked up + * is at a given offset into the word's partition WebArchive for a given + * generation. This is used when the word occurs more the + * COMMON_WORD_THRESHOLD many times in a generation * - * @param string $word_key - * @param int $offset - * @param int $generation - * @return array + * @param string $word_key hash of word whose doc block we are looking up + * @param int $offset byte offset into word's partition WebArchive for the + * supplied generation + * @param int $generation which generation to look up the doc block of + * @return array the desired doc block */ public function getWordDocBlock($word_key, $offset, $generation = -1) { @@ -860,11 +1028,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Gets a page using in WebArchive $partition of the word index + * using the provided byte $offset and using existing $file_handle + * if possible. 
* - * @param int $partition - * @param int $offset - * @param resource $file_handle - * @return array + * @param int $partition which WebArchive to look in + * @param int $offset byte offset of page data + * @param resource $file_handle file handle resource of $partition archive + * @return array desired page */ public function getPageByPartition($partition, $offset, $file_handle = NULL) { @@ -873,9 +1044,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Adds the given summary to the summary exists filter bundle * - * @param string $key_field - * @param array $page + * @param string $key_field field of page with hash of page content + * @param array $page summary of page */ public function addPageFilter($key_field, $page) { @@ -883,7 +1055,13 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Looks at the $field_name key of elements of pages and computes an array + * consisting of $field_name values which are not in + * the page_exists_filter_bundle of the summaries bundle * + * @param array &$page_array set of page data to start from + * @param string $field_name field to check against filter bundle + * @return mixed false if filter empty; desired array otherwise */ public function differenceContainsPages(&$page_array, $field_name = NULL) { @@ -892,13 +1070,14 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** - * + * Forces the data in the page exists filter bundle of summaries + * to be saved to disk, forces each index partition filter to be saved */ public function forceSave() { $this->summaries->forceSave(); for($i = 0; $i < $this->num_partitions_index; $i++) { - if($this->index_partition_filters[$i] && + if(isset($this->index_partition_filters[$i]) && $this->index_partition_filters[$i] != NULL) { $this->index_partition_filters[$i]->save(); } @@ -906,11 +1085,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Computes statistics for the provided 
phrase_key. + * These include an estimate of the total number of documents it occurs in, + * as well as which generations it occurs in, and what are its info block + * looks like in the current generation * - * @param string $phrase_key - * @param int $generation_index - * @param array $info_block - * @return array + * @param string $phrase_key what to compute statistics for + * @param int $generation_index the current generation + * @param array $info_block info_block of the phrase_key (will look up + * if not provided) + * @return array info for this $phrase_key */ public function getPhraseIndexInfo( $phrase_key, $generation_index = 0, $info_block = NULL) @@ -986,6 +1170,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Sets the information associated with a word in the inverted index * * @param string $phrase_key * @param array $info @@ -1008,11 +1193,17 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Adds the supplied phrase to the IndexArchiveBundle. * - * @param string $word_key - * @param array $restrict_phrases - * @param string $phrase_key - * @param $num_needed + * The most selective word in the phrase is $word_key, the additional + * words are in $restrict_phrases, the hash of the phrase to add is + * $phrase_key, and if the will be a lot of results compute at least + * the first $num_needed. 
+ * + * @param string $word_key hash of most selective word in phrase + * @param array $restrict_phrases additional words in phrase + * @param string $phrase_key hash of phrase to add + * @param $num_needed minimum number of doc results to save if possible */ public function addPhraseIndex($word_key, $restrict_phrases, $phrase_key, $num_needed) @@ -1082,11 +1273,12 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Computes the words which appear in the fewest or most documents * - * @param array $word_keys - * @param int $num - * @param string $comparison - * @return array + * @param array $word_keys keys of words to select amongst + * @param int $num number of words from the above set to return + * @param string $comparison callback function name for how to compare words + * @return array the $num most documents or $num least document words */ public function getSelectiveWords($word_keys, $num, $comparison="lessThan") //lessThan is in utility.php @@ -1109,10 +1301,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Reads the info block of $partition index WebArchive * - * @param int $partition - * @param int $generation - * @return array + * @param int $partition WebArchive to read from + * @return array data in its info block */ public function readPartitionInfoBlock($partition, $generation = -1) { @@ -1127,9 +1319,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** + * Write $data into the info block of the $partition index WebArchive * - * @param int $partition - * @param array $data + * @param int $partition WebArchive to write into + * @param array $data what to write */ public function writePartitionInfoBlock($partition, $data) { @@ -1137,7 +1330,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } /** - * Gets teh description, count of summaries, and number of partions of the + * Gets the description, count of summaries, and number of 
partitions of the * summaries store in the supplied directory * * @param string path to a directory containing a summaries WebArchiveBundle diff --git a/lib/priority_queue.php b/lib/priority_queue.php index 763863c3d..d389a58b0 100755 --- a/lib/priority_queue.php +++ b/lib/priority_queue.php @@ -52,8 +52,9 @@ require_once "crawl_constants.php"; /** * - * Code used to manage a memory efficient priority queue - * Weights for the queue must be flaots + * Code used to manage a memory efficient priority queue. + * Weights for the queue must be floats. The queue itself is + * implemented using heaps * * @author Chris Pollett * @@ -97,7 +98,8 @@ class PriorityQueue extends StringArray implements CrawlConstants var $notifier; // who to call if move an item in queue /** - * Makes a priority queue with the given operating parameters + * Makes a priority queue (implemented as an array heap) with the given + * operating parameters * * @param string $fname filename to store the data associated with the queue * @param int $num_values number of values the queue can hold @@ -129,7 +131,7 @@ class PriorityQueue extends StringArray implements CrawlConstants * Gets the data stored at the ith location in the priority queue * * @param int $i location to return data from - * @return mixed data if the value of $i is between 1 and count, false + * @return mixed array data if the value of $i is between 1 and count, false * otherwise */ function peek($i = 1) @@ -142,9 +144,16 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Removes and returns the ith element out of the Priority queue. + * Since this is a priority queue the first element in the queue + * will either be the min or max (depending on queue type) element + * stored. If $i is not in range an error message is written to the log. 
+ * This operation also performs a check to see if the queue should be + * saved to disk * - * @param int $i - * @return mixed + * @param int $i element to get out of the queue + * @return mixed array data if the value of $i is between 1 and count, false + * otherwise */ function poll($i = 1) { @@ -165,9 +174,12 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** - * @param string $data - * @param float $weight - * @return mixed + * Inserts a new item into the priority queue. + * + * @param string $data what to insert into the queue + * @param float $weight how much the new data should be weighted + * @return mixed index location in queue where item was stored if + * successful, otherwise false. */ function insert($data, $weight) { @@ -185,9 +197,11 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Add $delta to the $ith element in the priority queue and then adjusts + * the queue to restore the heap property * - * @param int $i - * @param float $delta + * @param int $i element whose weight should be adjusted + * @param float $delta how much to change the weight by */ function adjustWeight($i, $delta) { @@ -217,7 +231,8 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** - * + * Pretty prints the contents of the queue viewed as an array. + * */ function printContents() { @@ -228,8 +243,10 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Return the contents of the priority queue as an array of + * value weight pairs. * - * @return array + * @return array contents of the queue */ function getContents() { @@ -242,8 +259,14 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Scales the weights of elements in the queue so that the sum of the new + * weights is $new_total * - * @param int $new_total + * This function is used periodically to prevent the queue from being + * gummed up because all of the weights stored in it are too small.
+ * + * @param int $new_total what the new sum of weights of elements in the + * queue will be after normalization */ function normalize($new_total = NUM_URLS_QUEUE_RAM) { @@ -267,9 +290,13 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * If the $ith element in the PriorityQueue violates the heap + * property with its parent node (children should be of lower + * priority than the parent), this function + * tries modify the heap to restore the heap property. * - * @param int $i - * @return int + * @param int $i node to consider in restoring the heap property + * @return int final position $ith node ends up at */ function percolateUp($i) { @@ -297,9 +324,12 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * If the ith element in the PriorityQueue violates the heap + * property with some child node (children should be of lower + * priority than the parent), this function + * tries modify the heap to restore the heap property. * - * @param int $i - * @return int + * @param int $i node to consider in restoring the heap property */ function percolateDown($i) { @@ -339,10 +369,14 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Computes the difference of the two values $value1 and $value2 + * + * Which is subtracted from which is determined by whether this is + * a min_or_max priority queue * - * @param float $value1 - * @param float $value2 - * @return float + * @param float $value1 a value to take the difference between + * @param float $value2 the other value + * @return float the differences */ function compare($value1, $value2) { @@ -354,10 +388,11 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Gets the ith element of the PriorityQueue viewed as an array * - * - * @param int $i - * @return array + * @param int $i element to get + * @return array value stored in queue together with its weight as a two + * element array */ function getRow($i) { diff 
--git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php index b8e6fb415..fa1c61b5d 100755 --- a/lib/processors/pdf_processor.php +++ b/lib/processors/pdf_processor.php @@ -144,9 +144,11 @@ class PdfProcessor extends TextProcessor } /** + * Checks if the PDF object's object dictionary is in a list of types * - * @param string $object_dictionary - * @param array $type_array + * @param string $object_dictionary the object dictionary to check + * @param array $type_array the list of types to check against + * @return whether it is in or not */ static function objectDictionaryHas($object_dictionary, $type_array) { @@ -160,9 +162,9 @@ class PdfProcessor extends TextProcessor } /** - * - * @param string $object_string - * @return string + * Gets the object dictionary portion of the current PDF object + * @param string $object_string represents the contents of a PDF object + * @return string the object dictionary for the object */ static function getObjectDictionary($object_string) { @@ -172,9 +174,10 @@ class PdfProcessor extends TextProcessor } /** + * Gets the object stream portion of the current PDF object * - * @param string $object_stream - * @return string + * @param string $object_string represents the contents of a PDF object + * @return string the object stream for the object */ static function getObjectStream($object_string) { @@ -185,9 +188,12 @@ class PdfProcessor extends TextProcessor } /** + * Extracts ASCII text from PDF data, getting rid of non printable data, + * square brackets and parenthesis and converting char codes to their + * values.
* - * @param string $data - * @return string + * @param string $data source to extract character data from + * @return string extracted text */ static function parseText($data) { @@ -227,10 +233,12 @@ class PdfProcessor extends TextProcessor } /** + * Extracts ASCII text till the next close brackets * - * @param string $data - * @param int $cur_pos - * @return array + * @param string $data source to extract character data from + * @param int $cur_pos position to start in $data + * @return array pair consisting of the final position in $data as well + * as extracted text */ static function parseBrackets($data, $cur_pos) { @@ -268,10 +276,12 @@ class PdfProcessor extends TextProcessor } /** + * Extracts ASCII text till the next close parenthesis * - * @param string $data - * @param int $cur_pos - * @return array + * @param string $data source to extract character data from + * @param int $cur_pos position to start in $data + * @return array pair consisting of the final position in $data as well + * as extracted text */ static function parseParentheses($data, $cur_pos) { @@ -302,7 +312,7 @@ class PdfProcessor extends TextProcessor } $check_positioning = substr($data, $cur_pos, 4); - if(preg_match("/\-\d{3}/", $check_positioning) >0 ) { + if(preg_match("/\-\d{3}/", $check_positioning) > 0 ) { $out .= " "; } diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php index 96994dfb7..3430d588b 100755 --- a/lib/processors/rtf_processor.php +++ b/lib/processors/rtf_processor.php @@ -50,9 +50,15 @@ class RtfProcessor extends TextProcessor { /** - * @param string $page - * @param string $url - * @return array + * Computes a summary based on a rtf string of a document + * + * @param string $page rtf string of a document + * @param string $url location the document came from, not used by + * RTFProcessor at this point. 
Some of its subclasses override + * this method and use url to produce complete links for + * relative links within a document + * @return array a summary of (title, description,links, and content) of + * the information in $page */ public static function process($page, $url) { @@ -72,9 +78,13 @@ class RtfProcessor extends TextProcessor } /** + * Gets plain text out of an rtf string + * + * Plain text is mainly extracted by getText(), this function does + * some pre and post processing of escape braces and stuff * - * @param string $rtf_string - * @return string + * @param string $rtf_string what to extract plain text out of + * @return string plain texts */ static function extractText($rtf_string) { $rtf_string = preg_replace('/\\\{/',"!ZZBL!", $rtf_string); @@ -92,9 +102,10 @@ class RtfProcessor extends TextProcessor } /** + * Gets plain text out of an rtf string * - * @param string $rtf_string - * @return string + * @param string $rtf_string what to extract plain text out of + * @return string plain texts */ static function getText($rtf_string) { @@ -130,10 +141,11 @@ class RtfProcessor extends TextProcessor } /** + * Gets the contents of the rtf group at the current position in the string * - * @param string $rtf_string - * @param int $cur_pos - * @return string + * @param string $rtf_string data to get rtf group from + * @param int $cur_pos position in $rtf_string at which to get group + * @return string contents of rtf groups */ static function getNextObject($rtf_string, $cur_pos) { diff --git a/lib/web_archive.php b/lib/web_archive.php index 85ffea504..1bb1a2dd7 100755 --- a/lib/web_archive.php +++ b/lib/web_archive.php @@ -51,39 +51,36 @@ class WebArchive { /** - * + * Filename used to store the web archive. 
+ * @var string */ var $filename; /** * + * Current offset into the web archive the iterator for the archive is at + * (at most one iterator / archive -- oh well) + * @var int */ var $iterator_pos; /** - * + * Filter object used to compress/uncompress objects stored in archive + * @var object */ var $compressor; /** - * + * number of item in archive + * @var int */ var $count; /** + * Makes or initializes a WebArchive object using the supplied parameters * - */ - const OPEN_AND_CLOSE = 1; - /** - * - */ - const OPEN = 2; - /** - * - */ - const CLOSE = 3; - /** - * - * @param string $fname - * @param string $compressor - * @param bool $fast_construct + * @param string $fname filename to use to store archive to disk + * @param string $compressor what kind of Compressor object should be + * used to read and write objects in the archive + * @param bool $fast_construct do we read the info block of the web + * archive as part of the constructing process */ function __construct($fname, $compressor, $fast_construct = false) { @@ -104,8 +101,11 @@ class WebArchive } /** - * - * @return array + * Read the info block associated with this web archive. + * The info block is meta data for the archive stored at the end of + * the WebArchive file. The particular meta is up to who is using + * the web archive. + * @return array the contents of the info block */ function readInfoBlock() { @@ -123,9 +123,15 @@ class WebArchive } /** + * Serializes and applies the compressor to an info block and write it at + * the end of the web archive + * The info block is meta data for the archive stored at the end of + * the WebArchive file. The particular meta is up to who is using + * the web archive. * - * @param resource $fh - * @param array &$data + * @param resource $fh resource for the web archive file. 
If null + * the web archive is open first and close when the data is written + * @param array &$data data to write into the info block of the archive */ function writeInfoBlock($fh = NULL, &$data = NULL) { @@ -153,9 +159,12 @@ class WebArchive } /** + * Seeks in the WebArchive file to the end of the last Object. + * + * The last 4 bytes of a WebArchive say the length of an info block in bytes * - * @param resource $fh - * @return int + * @param resource $fh resource for the WebArchive file + * @return int offset length of info block */ function seekEndObjects($fh) { @@ -168,13 +177,20 @@ class WebArchive } /** + * Adds objects to the WebArchive * - * @param string $offset_field - * @param array &$objects - * @param array $data - * @param string $callback - * @param bool $return_flag - * @return mixed + * @param string $offset_field field in objects to return the byte offset + * at which they were stored + * @param array &$objects references to objects that will be stored + * the offset field in these references will be adjusted if + * @param array $data data to write in the WebArchive's info block + * @param string $callback name of a callback + * $callback($data, $new_objects, $offset_field) + * used to modify $data before it is written + * to the info block. For instance, we can add offset info to data. + * @param bool $return_flag if true rather than adjust the offsets by + * reference, create copy objects and adjust their offsets anf return + * @return mixed adjusted objects or void */ function addObjects($offset_field, &$objects, $data = NULL, $callback = NULL, $return_flag = true) @@ -224,9 +240,10 @@ class WebArchive } /** + * Open the web archive file associated with this WebArchive object. 
* - * @param string $mode - * @return resource + * @param string $mode read/write mode to open file with + * @return resource a file resource for the web archive */ function open($mode = "r") { @@ -243,12 +260,17 @@ class WebArchive } /** + * Gets $num many objects out of the web archive starting at byte $offset + * + * If the $next_flag is true the archive iterator is advanced and if $fh + * is not NULL then it is assumed to be an open resource pointing to the + * archive (saving the time to open it). * - * @param int $offset - * @param int $num - * @param bool $next_flag - * @param resource $fh - * @return array + * @param int $offset a valid byte offset into a web archive + * @param int $num number of objects to return + * @param bool $next_flag whether to advance the archive iterator + * @param resource $fh either NULL or a file resource to the archive + * @return array the $num objects beginning at $offset */ function getObjects($offset, $num, $next_flag = true, $fh = NULL) { diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index 9f77ca1e8..4f56f6b77 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -34,20 +34,20 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * A WebArchiveBundle is a collection of WebArchive, so load definition of + * A WebArchiveBundle is a collection of WebArchive's, so load definition of * web archive */ require_once 'web_archive.php'; /** - * + * Bloom Filter used by BloomFilterBundle */ require_once 'bloom_filter_file.php'; /** - * + * Used to check if a page is already stored in the WebArchiveBundle */ require_once 'bloom_filter_bundle.php'; /** - * + * Used to compress data stored in WebArchiveBundle */ require_once 'gzip_compressor.php'; @@ -58,7 +58,7 @@ require_once 'gzip_compressor.php'; * together.It is useful to split data across several archive files rather than * just store it in one, for both read efficiency and to keep filesizes from * getting too big.
In some places we are using 4 byte int's to store file - * offset which restricts the size of the files we can use for wbe archives. + * offsets which restricts the size of the files we can use for web archives. * * @author Chris Pollett * @@ -69,45 +69,61 @@ class WebArchiveBundle { /** - * + * Folder name to use for this WebArchiveBundle + * @var string */ var $dir_name; /** - * + * Maximum allowed insert into each BloomFilterFile in the + * page_exists_filter_bundle + * @var int */ var $filter_size; /** - * + * Used to contain the WebArchive partitions of the bundle + * @var array */ var $partition = array(); /** - * + * BloomFilterBundle used to keep track of which pages are already in + * WebArchiveBundle + * @var object */ var $page_exists_filter_bundle; /** - * + * Number of WebArchives in the WebArchiveBundle + * @var int */ var $num_partitions; /** - * + * Total number of page objects stored by this WebArchiveBundle + * @var int */ var $count; /** - * + * A short text name for this WebArchiveBundle + * @var string */ var $description; /** - * + * Compressor object used to compress/uncompress data stored in + * the bundle + * @var object */ var $compressor; /** + * Makes or initializes an existing WebArchiveBundle with the given + * characteristics * - * @param string $dir_name - * @param int $filter_size - * @param int $num_partitions - * @param string $description - * @param string $compressor + * @param string $dir_name folder name of the bundle + * @param int $filter_size number of items that can be stored in + * a given BloomFilterFile in the $page_exists_filter_bundle + * @param int $num_partitions number of WebArchive's in this bundle + * @param string $description a short text name/description of this + * WebArchiveBundle + * @param string $compressor the Compressor object used to + * compress/uncompress data stored in the bundle */ function __construct($dir_name, $filter_size = -1, $num_partitions = NULL, $description = NULL, @@ -176,11 +192,14
@@ class WebArchiveBundle } /** + * Add the array of $pages to the WebArchiveBundle pages being stored in + * the partition according to the $key_field and the field used to store + * the resulting offsets given by $offset_field. * - * @param string $key_field - * @param string $offset_field - * @param array &$pages - * @return array + * @param string $key_field field used to select partition + * @param string $offset_field field used to record offsets after storing + * @param array &$pages data to store + * @return array $pages adjusted with offset field */ function addPages($key_field, $offset_field, &$pages) { @@ -222,10 +241,13 @@ class WebArchiveBundle } /** + * Gets the page out of the WebArchiveBundle with the given key and offset * - * @param string $key - * @param int $offset - * @return array + * The $key determines the partition WebArchive, the $offset give the + * byte offset within that archive. + * @param string $key hash to use to look up WebArchive partition + * @param int $offset byte offset in partition of desired page + * @return array desired page */ function getPage($key, $offset) { @@ -236,11 +258,13 @@ class WebArchiveBundle } /** + * Gets a page using in WebArchive $partition using the provided byte + * $offset and using existing $file_handle if possible. 
* - * @param int $partition - * @param int $offset - * @param resource $file_handle - * @return array + * @param int $partition which WebArchive to look in + * @param int $offset byte offset of page data + * @param resource $file_handle file handle resource of $partition archive + * @return array desired page */ function getPageByPartition($partition, $offset, $file_handle = NULL) { @@ -256,10 +280,11 @@ class WebArchiveBundle } /** + * Adds the given page to the page exists filter bundle * - * @param string $key_field - * @param array &$page - * @return bool + * @param string $key_field field of page with hash of page content + * @param array &$page contents/summary of page + * @return bool whether the add succeeded */ function addPageFilter($key_field, &$page) { @@ -272,14 +297,17 @@ class WebArchiveBundle } /** + * Adds a list of objects to a given WebArchive partition * - * @param string $offset_field - * @param int $partition - * @param array &$objects - * @param array $data - * @param string $callback - * @param bool $return_flag - * @return mixed + * @param string $offset_field field used to store offsets after the + * addition + * @param int $partition WebArchive index to store data into + * @param array &$objects objects to store + * @param array $data info header data to write + * @param string $callback function name of function to call as each + * object is stored. 
Can be used to save offset into $data + * @param bool $return_flag whether to return modified $objects or not + * @return mixed adjusted objects or void */ function addObjectsPartition($offset_field, $partition, &$objects, $data = NULL, $callback = NULL, $return_flag = true) @@ -292,9 +320,10 @@ class WebArchiveBundle } /** + * Reads the info block of $partition WebArchive * - * @param int $partition - * @return array + * @param int $partition WebArchive to read from + * @return array data in its info block */ function readPartitionInfoBlock($partition) { @@ -302,9 +331,10 @@ class WebArchiveBundle } /** + * Write $data into the info block of the $partition WebArchive * - * @param int $partition - * @param array $data + * @param int $partition WebArchive to write into + * @param array $data what to write */ function writePartitionInfoBlock($partition, &$data) { @@ -312,10 +342,13 @@ class WebArchiveBundle } /** + * Looks at the $key_field key of elements of pages and computes an array + * consisting of $key_field values which are not in + * the page_exists_filter_bundle * - * @param array $pages - * @param string $key_field - * @return mixed + * @param array $pages set of page data to start from + * @param string $key_field field to check against filter bundle + * @return mixed false if filter empty; desired array otherwise */ function differencePageKeysFilter($pages, $key_field) { @@ -334,9 +367,11 @@ class WebArchiveBundle } /** + * Looks at the field_name key of elements of page_array and removes any + * of these which are in the page_exists_filter_bundle * - * @param array &$page_array - * @param string $field_name + * @param array &$page_array array to element remove elements from + * @param string $field_name field to check against filter bundle */ function differencePagesFilter(&$page_array, $field_name = NULL) { @@ -345,7 +380,7 @@ class WebArchiveBundle } /** - * + * Forces the data in the page exists filter bundle to be save to disk */ function 
forceSave() { @@ -355,9 +390,14 @@ class WebArchiveBundle } /** + * Gets an object encapsulating the $index th WebArchive partition in + * this bundle. * - * @param int $index - * @param bool $fast_construct + * @param int $index the number of the partition within this bundle to + * return + * @param bool $fast_construct should the constructor of the WebArchive + * avoid reading in its info block. + * @return object the WebArchive file which was requested */ function getPartition($index, $fast_construct = true) { @@ -379,8 +419,10 @@ class WebArchiveBundle } /** + * Updates the description file with the current count for the number of + * items in the WebArchiveBundle * - * @param int $num + * @param int $num number of items to add to current count */ function addCount($num) { @@ -391,9 +433,14 @@ class WebArchiveBundle } /** + * Gets information about a WebArchiveBundle out of its description.txt + * file * - * @param string $dir_name - * @return array + * @param string $dir_name folder name of the WebArchiveBundle to get info + * for + * @return array containing the name (description) of the WebArchiveBundle, + * the number of items stored in it, and the number of WebArchive + * file partitions it uses. */ static function getArchiveInfo($dir_name) { @@ -413,10 +460,12 @@ class WebArchiveBundle } /** + * Hashes $value to a WebArchive partition it should be read/written to, + * if a bundle has $num_partitions partitions. 
* - * @param string $value - * @param int $num_partitions - * @return int + * @param string $value item to hash + * @param int $num_partitions number of partitions + * @return int which partition $value should be written to/read from */ static function selectPartition($value, $num_partitions) { diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php index 20eb54ef3..efb06bd13 100755 --- a/lib/web_queue_bundle.php +++ b/lib/web_queue_bundle.php @@ -64,8 +64,14 @@ require_once 'web_archive.php'; require_once 'utility.php'; /** - * Encapsulates the data structures needed to have a queue of urls to crawl - * next + * Encapsulates the data structures needed to have a queue of to crawl urls + * + * <pre> + * (hash of url, weights) are stored in a PriorityQueue, + * (hash of url, index in PriorityQueue, offset of url in WebArchive) is stored + * in a HashTable + * urls are stored in a WebArchive in an uncompressed format + * </pre> * * @author Chris Pollett * @@ -81,74 +87,93 @@ class WebQueueBundle implements Notifier */ var $dir_name; /** - * + * Number items that can be stored in a partition of the page exists filter + * bundle * @var int */ var $filter_size; /** - * + * number of entries the priority queue used by this web queue bundle + * can store * @var int */ var $num_urls_ram; /** - * + * whether polling the first element of the priority queue returns the + * smallest or largest weighted element. This is set to a constant specified + * in PriorityQueue * @var int */ var $min_or_max; /** - * + * the PriorityQueue used by this WebQueueBundle * @var object */ var $to_crawl_queue; /** - * + * the HashTable used by this WebQueueBundle * @var object */ var $to_crawl_table; /** - * + * Current count of the number of non-read operation performed on the + * WebQueueBundles's hash table since the last time it was rebuilt. * @var int */ var $hash_rebuild_count; /** - * + * Number of non-read operations on the hash table before it needs to be + * rebuilt. 
* @var int */ var $max_hash_ops_before_rebuild; /** - * + * WebArchive used to store urls that are to be crawled * @var object */ var $to_crawl_archive; + /** + * BloomFilter used to keep track of which urls we've already seen + * @var object + */ var $url_exists_filter_bundle; /** - * + * BloomFilter used to store which hosts whose robots.txt file we + * have already download * @var object */ var $got_robottxt_filter; /** - * + * BloomFilter used to store dissallowed to crawl host paths * @var object */ var $dissallowed_robot_filter; /** - * + * BloomFilter used to keep track of crawl delay in seconds for a given + * host * @var object */ var $crawl_delay_filter; /** - * + * The largest offset for the url WebArchive before we rebuild it. + * Entries are never deleted from the url WebArchive even if they are + * deleted from the priority queue. So when we pass this value we + * make a new WebArchive containing only those urls which are still in + * the queue. */ const max_url_archive_offset = 1000000000; /** + * Makes a WebQueueBundle with the provided parameters * - * @param string $dir_name - * @param int $filter_size - * @param int $num_urls_ram - * @param string $min_or_max + * @param string $dir_name folder name used by this WebQueueBundle + * @param int $filter_size size of each partition in the page exists + * BloomFilterBundle + * @param int $num_urls_ram number of entries in ram for the priority queue + * @param string $min_or_max when the priority queue maintain the heap + * property with respect to the least or the largest weight */ function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max) @@ -234,8 +259,9 @@ class WebQueueBundle implements Notifier } /** + * Adds an array of (url, weight) pairs to the WebQueueBundle. 
* - * @param array $url_paris + * @param array $url_pairs a list of pairs to add */ function addUrlsQueue(&$url_pairs) { @@ -277,8 +303,9 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $url + * Check is the url queue already contains the given url + * @param string $url what to check + * @return bool whether it is contained in the queue yet or not */ function containsUrlQueue(&$url) { @@ -288,9 +315,16 @@ class WebQueueBundle implements Notifier } /** + * Adjusts the weight of the given url in the priority queue by amount delta * - * @param string $url - * @param float $delta + * In a page importance crawl. a given web page casts its votes on who + * to crawl next by splitting its crawl money amongst its child links. + * This entails a mechanism for adusting weights of elements in the + * priority queue periodically is necessary. This function is used to + * solve this problem. + * + * @param string $url url whose weight in queue we want to adjust + * @param float $delta change in weight (usually positive). */ function adjustQueueWeight(&$url, $delta) { @@ -308,8 +342,13 @@ class WebQueueBundle implements Notifier } /** + * Removes a url from the priority queue. + * + * This method would typical be called during a crawl after the given + * url is scheduled to be crawled. It only deletes the item from + * the bundles priority queue and hash table -- not from the web archive. 
* - * @param string $url + * @param string $url the url to delete */ function removeQueue($url) { @@ -331,10 +370,10 @@ class WebQueueBundle implements Notifier } /** - * - * @param int $i - * @param resource $fh - * @return mixed + * Gets the url and weight of the ith entry in the priority queue + * @param int $i entry to look up + * @param resource $fh a file handle to the WebArchive for urls + * @return mixed false on error, otherwise the ordered pair in an array */ function peekQueue($i = 1, $fh = NULL) { @@ -365,7 +404,7 @@ class WebQueueBundle implements Notifier } /** - * + * Pretty prints the contents of the queue bundle in order */ function printContents() { @@ -378,8 +417,9 @@ class WebQueueBundle implements Notifier } /** - * - * @return array + * Gets the contents of the queue bundle as an array of ordered url,weight + * pairs + * @return array a list of ordered url, wight pairs */ function getContents() { @@ -392,8 +432,9 @@ class WebQueueBundle implements Notifier } /** - * - * @param int $new_total + * Makes the weight sum of the to-crawl priority queue sum to $new_total + * @param int $new_total amount weights should sum to. All weights will be + * scaled by the same factor. 
*/ function normalize($new_total = NUM_URLS_QUEUE_RAM) { @@ -403,9 +444,10 @@ class WebQueueBundle implements Notifier //Filter and Filter Bundle Methods /** - * - * @param string $mode - * @return resource + * Opens the url WebArchive associated with this queue bundle in the + * given read/write mode + * @param string $mode the read/write mode to open the archive with + * @return resource a file handle to the WebArchive file */ function openUrlArchive($mode = "r") { @@ -413,8 +455,8 @@ class WebQueueBundle implements Notifier } /** - * - * @param resource $fh + * Closes a file handle to the url WebArchive + * @param resource $fh a valid handle to the url WebArchive file */ function closeUrlArchive($fh) { @@ -422,8 +464,8 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $url + * Adds the supplied url to the url_exists_filter_bundle + * @param string $url url to add */ function addSeenUrlFilter($url) { @@ -431,9 +473,10 @@ class WebQueueBundle implements Notifier } /** - * - * @param array &$url_array - * @param string $field_name + * Removes all url objects from $url_array which have been seen + * @param array &$url_array objects to check if have been seen + * @param string $field_name component of a url_array element which + * contains a url to check if seen */ function differenceSeenUrls(&$url_array, $field_name = NULL) { @@ -442,8 +485,8 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $host + * Adds the supplied $host to the got_robottxt_filter + * @param string $host url to add */ function addGotRobotTxtFilter($host) { @@ -451,9 +494,9 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $host - * @return bool + * Checks if we have a fresh copy of robots.txt info for $host + * @param string $host url to check + * @return bool whether we do or not */ function containsGotRobotTxt($host) { @@ -461,18 +504,20 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $host + * Adds a new 
entry to the disallowed robot host path Bloom filter + * @param string $host_path the path on host that is excluded. For example + * http://somewhere.com/bob disallows bob on somewhere.com */ - function addDisallowedRobotFilter($host) + function addDisallowedRobotFilter($host_path) { - $this->dissallowed_robot_filter->add($host); + $this->dissallowed_robot_filter->add($host_path); } /** - * - * @param string $host_path - * @return bool + * Checks if the given $host_path is disallowed by the host's + * robots.txt info. + * @param string $host_path host path to check + * @return bool whether it was disallowed or nots */ function containsDisallowedRobot($host_path) { @@ -480,8 +525,9 @@ class WebQueueBundle implements Notifier } /** - * - * @return int + * Gets when the timestamp of the oldest robot data still stored in + * the queue bundle + * @return int a Unix timestamp */ function getRobotTxtAge() { @@ -493,9 +539,10 @@ class WebQueueBundle implements Notifier } /** + * Sets the Crawl-delay of $host to passes $value in seconds * - * @param string $host - * @param int $value + * @param string $host a host to set the Crawl-delay for + * @param int $value a delay in seconds up to 255 */ function setCrawlDelay($host, $value) { @@ -511,9 +558,10 @@ class WebQueueBundle implements Notifier } /** + * Gets the Crawl-delay of $host from the crawl delay bloom filter * - * @param string $host - * @return int + * @param string $host site to check for a Crawl-delay + * @return int the crawl-delay in seconds or -1 if $host has no delay */ function getCrawlDelay($host) { @@ -532,10 +580,15 @@ class WebQueueBundle implements Notifier } /** + * Mainly, a Factory style wrapper around the HashTable's constructor. + * However, this function also sets up a rebuild frequency. 
It is used + * as part of the process of keeping the to crawl table from having too + * many entries * - * @param string $name - * @param int $num_values - * @return object + * @param string $name filename to store the hash table persistently + * @param int $num_values size of HashTable's array + * @return object the newly built hash table + * @see rebuildHashTable() */ function constructHashTable($name, $num_values) { @@ -545,9 +598,11 @@ class WebQueueBundle implements Notifier } /** + * Looks up $key in the to-crawl hash table * - * @param string $key - * @return string + * @param string $key the thing to look up + * @return mixed would be string if the value is being returned, + * otherwise, false if the key is not found */ function lookupHashTable($key) { @@ -555,12 +610,12 @@ class WebQueueBundle implements Notifier } /** - * - * @param string $value + * Removes an entry from the to crawl hash table + * @param string $key usually a hash of a url */ - function deleteHashTable($value) + function deleteHashTable($key) { - $this->to_crawl_table->delete($value); + $this->to_crawl_table->delete($key); $this->hash_rebuild_count++; if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) { $this->rebuildHashTable(); @@ -568,10 +623,12 @@ class WebQueueBundle implements Notifier } /** + * Inserts the $key, $value pair into this web queue's to crawl table * - * @param string $key - * @param string $value - * @return bool + * @param string $key intended to be a hash of a url + * @param string $value intended to be offset into a webarchive for urls + * together with an index into the priority queue + * @return bool whether the insert was a success or not */ function insertHashTable($key, $value) { @@ -583,7 +640,21 @@ class WebQueueBundle implements Notifier } /** + * Makes a new HashTable without deleted rows * + * The hash table in Yioop is implemented using open addressing.
i.e., + * we store key-value pairs in the table itself and if there is a collision + * we look for the next available slot. Two codes are used to indicate + * space available in the table. One to indicate empty never used, the + * other used to indicate empty but previously used and deleted. The reason + * you need two codes is to ensure that if somebody inserted an item B, + * it hashes to the same value as A and we move to the next empty slot, + * to store B, then if we delete A we should still be able to look up B. + * The problem is as the table gets reused a lot, it tends to fill up + * with a lot of deleted entries making lookup times get more and more + * linear in the hash table size. By rebuilding the table we mitigate + * against this problem. By choosing the rebuild frequency appropriately, + * the amortized cost of this operation is only O(1). */ function rebuildHashTable() { @@ -688,9 +759,13 @@ class WebQueueBundle implements Notifier } /** + * Callback which is called when an item in the priority queue changes + * position. The position is updated in the hash table. + * The priority queue stores (hash of url, weight). The hash table + * stores (hash of url, web_archive offset to url, index in priority queue). * - * @param int $index - * @param array $data + * @param int $index new index in priority queue + * @param array $data (hash of url, weight) */ function notify($index, $data) {